Reference tutorial¶
https://www.sc-best-practices.org/introduction/prior_art.html
video:
Install environment¶
In terminal
conda create -n sc2024 python=3.9
conda activate sc2024
pip install notebook scanpy doubletdetection
Switch to the new environment and start JupyterLab
Loading modules¶
import os  # portable interface to operating-system functionality
import scanpy as sc  # toolkit for processing and analysing large-scale single-cell RNA-seq data
import seaborn as sns  # matplotlib-based visualisation library with simplified plotting and clean defaults
import matplotlib.pyplot as plt  # pyplot interface for static, animated and interactive figures
import pandas as pd  # data processing and analysis library, well suited to structured data
# Suppress FutureWarning, UserWarning and RuntimeWarning messages so that they
# do not clutter the notebook output while the data is being processed.
import warnings

# Register one "ignore" filter per category, in this exact order.
for _ignored_category in (FutureWarning, UserWarning, RuntimeWarning):
    warnings.simplefilter("ignore", _ignored_category)
del _ignored_category  # keep the loop variable out of the notebook namespace
Download and Prepare Data¶
GEO link: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE235063
# ! 符号用于执行系统命令
!mkdir data #创建一个data文件夹
!tar -xf data/GSE235063_RAW.tar -C data
ls data
驱动器 C 中的卷没有标签。
卷的序列号是 10D4-7984
C:\Users\Administrator\Desktop\test\data 的目录
2024-06-18 19:42 <DIR> .
2024-06-18 19:42 <DIR> ..
2024-06-18 17:10 7,544,156,160 GSE235063_RAW.tar
2023-06-16 01:08 29,751 GSM7494257_AML16_DX_processed_barcodes.tsv.gz
2023-06-16 01:08 87,866 GSM7494257_AML16_DX_processed_genes.tsv.gz
2023-06-16 01:08 49,880,893 GSM7494257_AML16_DX_processed_matrix.mtx.gz
2023-06-16 01:08 183,537 GSM7494257_AML16_DX_processed_metadata.tsv.gz
2023-06-16 01:08 19,367,602 GSM7494257_AML16_DX_raw_barcodes.tsv.gz
2023-06-16 01:08 254,821 GSM7494257_AML16_DX_raw_genes.tsv.gz
2023-06-16 01:09 86,702,202 GSM7494257_AML16_DX_raw_matrix.mtx.gz
2023-06-16 01:09 20,398 GSM7494258_AML16_REL_processed_barcodes.tsv.gz
2023-06-16 01:09 87,867 GSM7494258_AML16_REL_processed_genes.tsv.gz
2023-06-16 01:09 30,464,777 GSM7494258_AML16_REL_processed_matrix.mtx.gz
2023-06-16 01:09 121,775 GSM7494258_AML16_REL_processed_metadata.tsv.gz
2023-06-16 01:09 19,367,603 GSM7494258_AML16_REL_raw_barcodes.tsv.gz
2023-06-16 01:09 254,822 GSM7494258_AML16_REL_raw_genes.tsv.gz
2023-06-16 01:10 62,076,582 GSM7494258_AML16_REL_raw_matrix.mtx.gz
2023-06-16 01:10 23,047 GSM7494259_AML16_REM_processed_barcodes.tsv.gz
2023-06-16 01:10 87,867 GSM7494259_AML16_REM_processed_genes.tsv.gz
2023-06-16 01:10 33,922,410 GSM7494259_AML16_REM_processed_matrix.mtx.gz
2023-06-16 01:10 138,763 GSM7494259_AML16_REM_processed_metadata.tsv.gz
2023-06-16 01:10 19,367,603 GSM7494259_AML16_REM_raw_barcodes.tsv.gz
2023-06-16 01:10 254,822 GSM7494259_AML16_REM_raw_genes.tsv.gz
2023-06-16 01:10 56,851,099 GSM7494259_AML16_REM_raw_matrix.mtx.gz
2023-06-16 01:10 28,064 GSM7494260_AML6_DX_processed_barcodes.tsv.gz
2023-06-16 01:10 83,951 GSM7494260_AML6_DX_processed_genes.tsv.gz
2023-06-16 01:10 43,102,857 GSM7494260_AML6_DX_processed_matrix.mtx.gz
2023-06-16 01:10 170,376 GSM7494260_AML6_DX_processed_metadata.tsv.gz
2023-06-16 01:11 19,367,601 GSM7494260_AML6_DX_raw_barcodes.tsv.gz
2023-06-16 01:11 254,820 GSM7494260_AML6_DX_raw_genes.tsv.gz
2023-06-16 01:11 79,014,793 GSM7494260_AML6_DX_raw_matrix.mtx.gz
2023-06-16 01:11 23,921 GSM7494261_AML6_REL_processed_barcodes.tsv.gz
2023-06-16 01:11 83,952 GSM7494261_AML6_REL_processed_genes.tsv.gz
2023-06-16 01:11 38,350,741 GSM7494261_AML6_REL_processed_matrix.mtx.gz
2023-06-16 01:11 143,156 GSM7494261_AML6_REL_processed_metadata.tsv.gz
2023-06-16 01:11 19,367,602 GSM7494261_AML6_REL_raw_barcodes.tsv.gz
2023-06-16 01:11 254,821 GSM7494261_AML6_REL_raw_genes.tsv.gz
2023-06-16 01:12 65,441,183 GSM7494261_AML6_REL_raw_matrix.mtx.gz
2023-06-16 01:12 16,648 GSM7494262_AML6_REM_processed_barcodes.tsv.gz
2023-06-16 01:12 83,952 GSM7494262_AML6_REM_processed_genes.tsv.gz
2023-06-16 01:12 19,590,433 GSM7494262_AML6_REM_processed_matrix.mtx.gz
2023-06-16 01:12 98,210 GSM7494262_AML6_REM_processed_metadata.tsv.gz
2023-06-16 01:12 19,367,602 GSM7494262_AML6_REM_raw_barcodes.tsv.gz
2023-06-16 01:12 254,821 GSM7494262_AML6_REM_raw_genes.tsv.gz
2023-06-16 01:12 41,173,295 GSM7494262_AML6_REM_raw_matrix.mtx.gz
2023-06-16 01:12 22,227 GSM7494263_AML2_DX_processed_barcodes.tsv.gz
2023-06-16 01:12 85,538 GSM7494263_AML2_DX_processed_genes.tsv.gz
2023-06-16 01:12 31,590,399 GSM7494263_AML2_DX_processed_matrix.mtx.gz
2023-06-16 01:12 133,746 GSM7494263_AML2_DX_processed_metadata.tsv.gz
2023-06-16 01:12 19,367,601 GSM7494263_AML2_DX_raw_barcodes.tsv.gz
2023-06-16 01:12 254,820 GSM7494263_AML2_DX_raw_genes.tsv.gz
2023-06-16 01:13 49,031,697 GSM7494263_AML2_DX_raw_matrix.mtx.gz
2023-06-16 01:13 24,812 GSM7494264_AML2_REL_processed_barcodes.tsv.gz
2023-06-16 01:13 85,539 GSM7494264_AML2_REL_processed_genes.tsv.gz
2023-06-16 01:13 38,593,511 GSM7494264_AML2_REL_processed_matrix.mtx.gz
2023-06-16 01:13 150,329 GSM7494264_AML2_REL_processed_metadata.tsv.gz
2023-06-16 01:13 19,367,602 GSM7494264_AML2_REL_raw_barcodes.tsv.gz
2023-06-16 01:13 254,821 GSM7494264_AML2_REL_raw_genes.tsv.gz
2023-06-16 01:13 58,768,408 GSM7494264_AML2_REL_raw_matrix.mtx.gz
2023-06-16 01:13 9,096 GSM7494265_AML2_REM_processed_barcodes.tsv.gz
2023-06-16 01:13 85,539 GSM7494265_AML2_REM_processed_genes.tsv.gz
2023-06-16 01:13 13,625,454 GSM7494265_AML2_REM_processed_matrix.mtx.gz
2023-06-16 01:13 52,148 GSM7494265_AML2_REM_processed_metadata.tsv.gz
2023-06-16 01:14 19,367,602 GSM7494265_AML2_REM_raw_barcodes.tsv.gz
2023-06-16 01:14 254,821 GSM7494265_AML2_REM_raw_genes.tsv.gz
2023-06-16 01:14 27,780,603 GSM7494265_AML2_REM_raw_matrix.mtx.gz
2023-06-16 01:14 26,757 GSM7494266_AML15_DX_processed_barcodes.tsv.gz
2023-06-16 01:14 83,451 GSM7494266_AML15_DX_processed_genes.tsv.gz
2023-06-16 01:14 35,485,678 GSM7494266_AML15_DX_processed_matrix.mtx.gz
2023-06-16 01:14 164,393 GSM7494266_AML15_DX_processed_metadata.tsv.gz
2023-06-16 01:14 19,367,602 GSM7494266_AML15_DX_raw_barcodes.tsv.gz
2023-06-16 01:14 254,821 GSM7494266_AML15_DX_raw_genes.tsv.gz
2023-06-16 01:14 56,631,932 GSM7494266_AML15_DX_raw_matrix.mtx.gz
2023-06-16 01:14 23,621 GSM7494267_AML15_REL_processed_barcodes.tsv.gz
2023-06-16 01:14 83,452 GSM7494267_AML15_REL_processed_genes.tsv.gz
2023-06-16 01:14 36,754,915 GSM7494267_AML15_REL_processed_matrix.mtx.gz
2023-06-16 01:14 144,466 GSM7494267_AML15_REL_processed_metadata.tsv.gz
2023-06-16 01:15 19,367,603 GSM7494267_AML15_REL_raw_barcodes.tsv.gz
2023-06-16 01:15 254,822 GSM7494267_AML15_REL_raw_genes.tsv.gz
2023-06-16 01:15 57,113,401 GSM7494267_AML15_REL_raw_matrix.mtx.gz
2023-06-16 01:15 8,980 GSM7494268_AML15_REM_processed_barcodes.tsv.gz
2023-06-16 01:15 83,452 GSM7494268_AML15_REM_processed_genes.tsv.gz
2023-06-16 01:15 11,162,022 GSM7494268_AML15_REM_processed_matrix.mtx.gz
2023-06-16 01:15 51,054 GSM7494268_AML15_REM_processed_metadata.tsv.gz
2023-06-16 01:15 19,367,603 GSM7494268_AML15_REM_raw_barcodes.tsv.gz
2023-06-16 01:15 254,822 GSM7494268_AML15_REM_raw_genes.tsv.gz
2023-06-16 01:15 22,627,500 GSM7494268_AML15_REM_raw_matrix.mtx.gz
2023-06-16 01:15 36,658 GSM7494269_AML3_DX_processed_barcodes.tsv.gz
2023-06-16 01:15 83,598 GSM7494269_AML3_DX_processed_genes.tsv.gz
2023-06-16 01:16 63,540,161 GSM7494269_AML3_DX_processed_matrix.mtx.gz
2023-06-16 01:16 229,153 GSM7494269_AML3_DX_processed_metadata.tsv.gz
2023-06-16 01:16 19,367,601 GSM7494269_AML3_DX_raw_barcodes.tsv.gz
2023-06-16 01:16 254,820 GSM7494269_AML3_DX_raw_genes.tsv.gz
2023-06-16 01:16 84,995,686 GSM7494269_AML3_DX_raw_matrix.mtx.gz
2023-06-16 01:16 12,787 GSM7494270_AML3_REM_processed_barcodes.tsv.gz
2023-06-16 01:16 83,599 GSM7494270_AML3_REM_processed_genes.tsv.gz
2023-06-16 01:16 14,219,196 GSM7494270_AML3_REM_processed_matrix.mtx.gz
2023-06-16 01:16 74,567 GSM7494270_AML3_REM_processed_metadata.tsv.gz
2023-06-16 01:16 19,367,602 GSM7494270_AML3_REM_raw_barcodes.tsv.gz
2023-06-16 01:16 254,821 GSM7494270_AML3_REM_raw_genes.tsv.gz
2023-06-16 01:16 26,216,012 GSM7494270_AML3_REM_raw_matrix.mtx.gz
2023-06-16 01:16 317 GSM7494271_AML7_DX_processed_barcodes.tsv.gz
2023-06-16 01:16 76,073 GSM7494271_AML7_DX_processed_genes.tsv.gz
2023-06-16 01:16 159,133 GSM7494271_AML7_DX_processed_matrix.mtx.gz
2023-06-16 01:16 1,596 GSM7494271_AML7_DX_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,601 GSM7494271_AML7_DX_raw_barcodes.tsv.gz
2023-06-16 01:17 254,820 GSM7494271_AML7_DX_raw_genes.tsv.gz
2023-06-16 01:17 9,022,172 GSM7494271_AML7_DX_raw_matrix.mtx.gz
2023-06-16 01:17 11,530 GSM7494272_AML7_REL_processed_barcodes.tsv.gz
2023-06-16 01:17 76,074 GSM7494272_AML7_REL_processed_genes.tsv.gz
2023-06-16 01:17 9,654,849 GSM7494272_AML7_REL_processed_matrix.mtx.gz
2023-06-16 01:17 65,123 GSM7494272_AML7_REL_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,602 GSM7494272_AML7_REL_raw_barcodes.tsv.gz
2023-06-16 01:17 254,821 GSM7494272_AML7_REL_raw_genes.tsv.gz
2023-06-16 01:17 20,279,967 GSM7494272_AML7_REL_raw_matrix.mtx.gz
2023-06-16 01:17 10,759 GSM7494273_AML7_REM_processed_barcodes.tsv.gz
2023-06-16 01:17 76,074 GSM7494273_AML7_REM_processed_genes.tsv.gz
2023-06-16 01:17 10,413,802 GSM7494273_AML7_REM_processed_matrix.mtx.gz
2023-06-16 01:17 61,428 GSM7494273_AML7_REM_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,602 GSM7494273_AML7_REM_raw_barcodes.tsv.gz
2023-06-16 01:17 254,821 GSM7494273_AML7_REM_raw_genes.tsv.gz
2023-06-16 01:17 19,064,329 GSM7494273_AML7_REM_raw_matrix.mtx.gz
2023-06-16 01:17 20,032 GSM7494274_AML8_DX_processed_barcodes.tsv.gz
2023-06-16 01:17 80,089 GSM7494274_AML8_DX_processed_genes.tsv.gz
2023-06-16 01:18 21,122,226 GSM7494274_AML8_DX_processed_matrix.mtx.gz
2023-06-16 01:18 118,859 GSM7494274_AML8_DX_processed_metadata.tsv.gz
2023-06-16 01:18 19,367,601 GSM7494274_AML8_DX_raw_barcodes.tsv.gz
2023-06-16 01:18 254,820 GSM7494274_AML8_DX_raw_genes.tsv.gz
2023-06-16 01:18 50,099,759 GSM7494274_AML8_DX_raw_matrix.mtx.gz
2023-06-16 01:18 23,199 GSM7494275_AML8_REL_processed_barcodes.tsv.gz
2023-06-16 01:18 80,090 GSM7494275_AML8_REL_processed_genes.tsv.gz
2023-06-16 01:18 28,644,068 GSM7494275_AML8_REL_processed_matrix.mtx.gz
2023-06-16 01:18 139,219 GSM7494275_AML8_REL_processed_metadata.tsv.gz
2023-06-16 01:18 19,367,602 GSM7494275_AML8_REL_raw_barcodes.tsv.gz
2023-06-16 01:18 254,821 GSM7494275_AML8_REL_raw_genes.tsv.gz
2023-06-16 01:18 44,520,290 GSM7494275_AML8_REL_raw_matrix.mtx.gz
2023-06-16 01:18 14,314 GSM7494276_AML8_REM_processed_barcodes.tsv.gz
2023-06-16 01:18 80,090 GSM7494276_AML8_REM_processed_genes.tsv.gz
2023-06-16 01:19 14,431,401 GSM7494276_AML8_REM_processed_matrix.mtx.gz
2023-06-16 01:19 82,740 GSM7494276_AML8_REM_processed_metadata.tsv.gz
2023-06-16 01:19 19,367,602 GSM7494276_AML8_REM_raw_barcodes.tsv.gz
2023-06-16 01:19 254,821 GSM7494276_AML8_REM_raw_genes.tsv.gz
2023-06-16 01:19 25,272,238 GSM7494276_AML8_REM_raw_matrix.mtx.gz
2023-06-16 01:19 28,065 GSM7494277_AML20_DX_processed_barcodes.tsv.gz
2023-06-16 01:19 79,460 GSM7494277_AML20_DX_processed_genes.tsv.gz
2023-06-16 01:19 45,826,895 GSM7494277_AML20_DX_processed_matrix.mtx.gz
2023-06-16 01:19 172,104 GSM7494277_AML20_DX_processed_metadata.tsv.gz
2023-06-16 01:19 19,367,602 GSM7494277_AML20_DX_raw_barcodes.tsv.gz
2023-06-16 01:19 254,821 GSM7494277_AML20_DX_raw_genes.tsv.gz
2023-06-16 01:20 60,089,573 GSM7494277_AML20_DX_raw_matrix.mtx.gz
2023-06-16 01:20 9,787 GSM7494278_AML20_REM_processed_barcodes.tsv.gz
2023-06-16 01:20 79,461 GSM7494278_AML20_REM_processed_genes.tsv.gz
2023-06-16 01:20 10,555,065 GSM7494278_AML20_REM_processed_matrix.mtx.gz
2023-06-16 01:20 55,656 GSM7494278_AML20_REM_processed_metadata.tsv.gz
2023-06-16 01:20 19,367,603 GSM7494278_AML20_REM_raw_barcodes.tsv.gz
2023-06-16 01:20 254,822 GSM7494278_AML20_REM_raw_genes.tsv.gz
2023-06-16 01:20 26,739,671 GSM7494278_AML20_REM_raw_matrix.mtx.gz
2023-06-16 01:20 23,563 GSM7494279_AML5_DX_processed_barcodes.tsv.gz
2023-06-16 01:20 83,571 GSM7494279_AML5_DX_processed_genes.tsv.gz
2023-06-16 01:20 36,819,972 GSM7494279_AML5_DX_processed_matrix.mtx.gz
2023-06-16 01:20 141,912 GSM7494279_AML5_DX_processed_metadata.tsv.gz
2023-06-16 01:20 19,367,601 GSM7494279_AML5_DX_raw_barcodes.tsv.gz
2023-06-16 01:20 254,820 GSM7494279_AML5_DX_raw_genes.tsv.gz
2023-06-16 01:21 58,892,249 GSM7494279_AML5_DX_raw_matrix.mtx.gz
2023-06-16 01:21 19,306 GSM7494280_AML5_REL_processed_barcodes.tsv.gz
2023-06-16 01:21 83,572 GSM7494280_AML5_REL_processed_genes.tsv.gz
2023-06-16 01:21 28,094,035 GSM7494280_AML5_REL_processed_matrix.mtx.gz
2023-06-16 01:21 113,413 GSM7494280_AML5_REL_processed_metadata.tsv.gz
2023-06-16 01:21 19,367,602 GSM7494280_AML5_REL_raw_barcodes.tsv.gz
2023-06-16 01:21 254,821 GSM7494280_AML5_REL_raw_genes.tsv.gz
2023-06-16 01:21 48,609,297 GSM7494280_AML5_REL_raw_matrix.mtx.gz
2023-06-16 01:21 18,495 GSM7494281_AML5_REM_processed_barcodes.tsv.gz
2023-06-16 01:21 83,572 GSM7494281_AML5_REM_processed_genes.tsv.gz
2023-06-16 01:21 19,938,386 GSM7494281_AML5_REM_processed_matrix.mtx.gz
2023-06-16 01:21 109,053 GSM7494281_AML5_REM_processed_metadata.tsv.gz
2023-06-16 01:21 19,367,602 GSM7494281_AML5_REM_raw_barcodes.tsv.gz
2023-06-16 01:21 254,821 GSM7494281_AML5_REM_raw_genes.tsv.gz
2023-06-16 01:22 33,901,580 GSM7494281_AML5_REM_raw_matrix.mtx.gz
2023-06-16 01:22 30,249 GSM7494282_AML17_DX_processed_barcodes.tsv.gz
2023-06-16 01:22 79,270 GSM7494282_AML17_DX_processed_genes.tsv.gz
2023-06-16 01:22 42,822,120 GSM7494282_AML17_DX_processed_matrix.mtx.gz
2023-06-16 01:22 185,029 GSM7494282_AML17_DX_processed_metadata.tsv.gz
2023-06-16 01:22 19,367,602 GSM7494282_AML17_DX_raw_barcodes.tsv.gz
2023-06-16 01:22 254,821 GSM7494282_AML17_DX_raw_genes.tsv.gz
2023-06-16 01:22 61,341,998 GSM7494282_AML17_DX_raw_matrix.mtx.gz
2023-06-16 01:22 39,213 GSM7494283_AML17_REL_processed_barcodes.tsv.gz
2023-06-16 01:22 79,271 GSM7494283_AML17_REL_processed_genes.tsv.gz
2023-06-16 01:22 37,678,921 GSM7494283_AML17_REL_processed_matrix.mtx.gz
2023-06-16 01:22 241,897 GSM7494283_AML17_REL_processed_metadata.tsv.gz
2023-06-16 01:23 19,367,603 GSM7494283_AML17_REL_raw_barcodes.tsv.gz
2023-06-16 01:23 254,822 GSM7494283_AML17_REL_raw_genes.tsv.gz
2023-06-16 01:23 61,044,197 GSM7494283_AML17_REL_raw_matrix.mtx.gz
2023-06-16 01:23 17,187 GSM7494284_AML1_REM_processed_barcodes.tsv.gz
2023-06-16 01:23 84,014 GSM7494284_AML1_REM_processed_genes.tsv.gz
2023-06-16 01:23 28,484,540 GSM7494284_AML1_REM_processed_matrix.mtx.gz
2023-06-16 01:23 102,693 GSM7494284_AML1_REM_processed_metadata.tsv.gz
2023-06-16 01:23 19,367,602 GSM7494284_AML1_REM_raw_barcodes.tsv.gz
2023-06-16 01:23 254,821 GSM7494284_AML1_REM_raw_genes.tsv.gz
2023-06-16 01:24 59,182,014 GSM7494284_AML1_REM_raw_matrix.mtx.gz
2023-06-16 01:24 12,319 GSM7494285_AML1_DX_processed_barcodes.tsv.gz
2023-06-16 01:24 84,013 GSM7494285_AML1_DX_processed_genes.tsv.gz
2023-06-16 01:24 19,023,140 GSM7494285_AML1_DX_processed_matrix.mtx.gz
2023-06-16 01:24 72,158 GSM7494285_AML1_DX_processed_metadata.tsv.gz
2023-06-16 01:24 19,367,601 GSM7494285_AML1_DX_raw_barcodes.tsv.gz
2023-06-16 01:24 254,820 GSM7494285_AML1_DX_raw_genes.tsv.gz
2023-06-16 01:24 37,294,389 GSM7494285_AML1_DX_raw_matrix.mtx.gz
2023-06-16 01:24 18,212 GSM7494286_AML27_DX_processed_barcodes.tsv.gz
2023-06-16 01:24 84,496 GSM7494286_AML27_DX_processed_genes.tsv.gz
2023-06-16 01:24 29,578,430 GSM7494286_AML27_DX_processed_matrix.mtx.gz
2023-06-16 01:24 108,535 GSM7494286_AML27_DX_processed_metadata.tsv.gz
2023-06-16 01:24 19,367,602 GSM7494286_AML27_DX_raw_barcodes.tsv.gz
2023-06-16 01:24 254,821 GSM7494286_AML27_DX_raw_genes.tsv.gz
2023-06-16 01:25 47,395,430 GSM7494286_AML27_DX_raw_matrix.mtx.gz
2023-06-16 01:25 20,200 GSM7494287_AML27_REL_processed_barcodes.tsv.gz
2023-06-16 01:25 84,497 GSM7494287_AML27_REL_processed_genes.tsv.gz
2023-06-16 01:25 32,430,945 GSM7494287_AML27_REL_processed_matrix.mtx.gz
2023-06-16 01:25 122,288 GSM7494287_AML27_REL_processed_metadata.tsv.gz
2023-06-16 01:25 19,367,603 GSM7494287_AML27_REL_raw_barcodes.tsv.gz
2023-06-16 01:25 254,822 GSM7494287_AML27_REL_raw_genes.tsv.gz
2023-06-16 01:25 53,159,637 GSM7494287_AML27_REL_raw_matrix.mtx.gz
2023-06-16 01:25 29,374 GSM7494288_AML27_REM_processed_barcodes.tsv.gz
2023-06-16 01:25 84,497 GSM7494288_AML27_REM_processed_genes.tsv.gz
2023-06-16 01:25 36,260,650 GSM7494288_AML27_REM_processed_matrix.mtx.gz
2023-06-16 01:25 178,981 GSM7494288_AML27_REM_processed_metadata.tsv.gz
2023-06-16 01:26 19,367,603 GSM7494288_AML27_REM_raw_barcodes.tsv.gz
2023-06-16 01:26 254,822 GSM7494288_AML27_REM_raw_genes.tsv.gz
2023-06-16 01:26 54,699,393 GSM7494288_AML27_REM_raw_matrix.mtx.gz
2023-06-16 01:26 34,023 GSM7494289_AML9_DX_processed_barcodes.tsv.gz
2023-06-16 01:26 81,973 GSM7494289_AML9_DX_processed_genes.tsv.gz
2023-06-16 01:26 61,149,166 GSM7494289_AML9_DX_processed_matrix.mtx.gz
2023-06-16 01:26 209,615 GSM7494289_AML9_DX_processed_metadata.tsv.gz
2023-06-16 01:26 19,367,601 GSM7494289_AML9_DX_raw_barcodes.tsv.gz
2023-06-16 01:26 254,820 GSM7494289_AML9_DX_raw_genes.tsv.gz
2023-06-16 01:27 88,960,064 GSM7494289_AML9_DX_raw_matrix.mtx.gz
2023-06-16 01:27 20,103 GSM7494290_AML9_REL_processed_barcodes.tsv.gz
2023-06-16 01:27 81,974 GSM7494290_AML9_REL_processed_genes.tsv.gz
2023-06-16 01:27 18,011,144 GSM7494290_AML9_REL_processed_matrix.mtx.gz
2023-06-16 01:27 117,266 GSM7494290_AML9_REL_processed_metadata.tsv.gz
2023-06-16 01:27 19,367,602 GSM7494290_AML9_REL_raw_barcodes.tsv.gz
2023-06-16 01:27 254,821 GSM7494290_AML9_REL_raw_genes.tsv.gz
2023-06-16 01:27 34,459,847 GSM7494290_AML9_REL_raw_matrix.mtx.gz
2023-06-16 01:27 13,146 GSM7494291_AML9_REM_processed_barcodes.tsv.gz
2023-06-16 01:27 81,974 GSM7494291_AML9_REM_processed_genes.tsv.gz
2023-06-16 01:27 14,660,074 GSM7494291_AML9_REM_processed_matrix.mtx.gz
2023-06-16 01:27 75,343 GSM7494291_AML9_REM_processed_metadata.tsv.gz
2023-06-16 01:27 19,367,602 GSM7494291_AML9_REM_raw_barcodes.tsv.gz
2023-06-16 01:27 254,821 GSM7494291_AML9_REM_raw_genes.tsv.gz
2023-06-16 01:28 30,331,894 GSM7494291_AML9_REM_raw_matrix.mtx.gz
2023-06-16 01:28 23,701 GSM7494292_AML10_DX_processed_barcodes.tsv.gz
2023-06-16 01:28 85,288 GSM7494292_AML10_DX_processed_genes.tsv.gz
2023-06-16 01:28 39,537,523 GSM7494292_AML10_DX_processed_matrix.mtx.gz
2023-06-16 01:28 143,906 GSM7494292_AML10_DX_processed_metadata.tsv.gz
2023-06-16 01:28 19,367,602 GSM7494292_AML10_DX_raw_barcodes.tsv.gz
2023-06-16 01:28 254,821 GSM7494292_AML10_DX_raw_genes.tsv.gz
2023-06-16 01:28 60,180,315 GSM7494292_AML10_DX_raw_matrix.mtx.gz
2023-06-16 01:28 12,387 GSM7494293_AML10_REL_processed_barcodes.tsv.gz
2023-06-16 01:28 85,289 GSM7494293_AML10_REL_processed_genes.tsv.gz
2023-06-16 01:28 16,136,041 GSM7494293_AML10_REL_processed_matrix.mtx.gz
2023-06-16 01:28 70,838 GSM7494293_AML10_REL_processed_metadata.tsv.gz
2023-06-16 01:28 19,367,603 GSM7494293_AML10_REL_raw_barcodes.tsv.gz
2023-06-16 01:28 254,822 GSM7494293_AML10_REL_raw_genes.tsv.gz
2023-06-16 01:29 47,428,946 GSM7494293_AML10_REL_raw_matrix.mtx.gz
2023-06-16 01:29 12,852 GSM7494294_AML10_REM_processed_barcodes.tsv.gz
2023-06-16 01:29 85,289 GSM7494294_AML10_REM_processed_genes.tsv.gz
2023-06-16 01:29 16,994,640 GSM7494294_AML10_REM_processed_matrix.mtx.gz
2023-06-16 01:29 75,569 GSM7494294_AML10_REM_processed_metadata.tsv.gz
2023-06-16 01:29 19,367,603 GSM7494294_AML10_REM_raw_barcodes.tsv.gz
2023-06-16 01:29 254,822 GSM7494294_AML10_REM_raw_genes.tsv.gz
2023-06-16 01:29 31,032,102 GSM7494294_AML10_REM_raw_matrix.mtx.gz
2023-06-16 01:29 31,983 GSM7494295_AML11_DX_processed_barcodes.tsv.gz
2023-06-16 01:29 85,488 GSM7494295_AML11_DX_processed_genes.tsv.gz
2023-06-16 01:29 50,814,144 GSM7494295_AML11_DX_processed_matrix.mtx.gz
2023-06-16 01:29 196,371 GSM7494295_AML11_DX_processed_metadata.tsv.gz
2023-06-16 01:30 19,367,602 GSM7494295_AML11_DX_raw_barcodes.tsv.gz
2023-06-16 01:30 254,821 GSM7494295_AML11_DX_raw_genes.tsv.gz
2023-06-16 01:30 71,519,175 GSM7494295_AML11_DX_raw_matrix.mtx.gz
2023-06-16 01:30 36,074 GSM7494296_AML11_REL_processed_barcodes.tsv.gz
2023-06-16 01:30 85,489 GSM7494296_AML11_REL_processed_genes.tsv.gz
2023-06-16 01:30 31,188,583 GSM7494296_AML11_REL_processed_matrix.mtx.gz
2023-06-16 01:30 217,837 GSM7494296_AML11_REL_processed_metadata.tsv.gz
2023-06-16 01:30 19,367,603 GSM7494296_AML11_REL_raw_barcodes.tsv.gz
2023-06-16 01:30 254,822 GSM7494296_AML11_REL_raw_genes.tsv.gz
2023-06-16 01:30 46,608,949 GSM7494296_AML11_REL_raw_matrix.mtx.gz
2023-06-16 01:30 24,672 GSM7494297_AML11_REM_processed_barcodes.tsv.gz
2023-06-16 01:30 85,489 GSM7494297_AML11_REM_processed_genes.tsv.gz
2023-06-16 01:31 32,305,699 GSM7494297_AML11_REM_processed_matrix.mtx.gz
2023-06-16 01:31 148,182 GSM7494297_AML11_REM_processed_metadata.tsv.gz
2023-06-16 01:31 19,367,603 GSM7494297_AML11_REM_raw_barcodes.tsv.gz
2023-06-16 01:31 254,822 GSM7494297_AML11_REM_raw_genes.tsv.gz
2023-06-16 01:31 50,839,803 GSM7494297_AML11_REM_raw_matrix.mtx.gz
2023-06-16 01:31 21,439 GSM7494298_AML4_DX_processed_barcodes.tsv.gz
2023-06-16 01:31 83,468 GSM7494298_AML4_DX_processed_genes.tsv.gz
2023-06-16 01:31 42,796,240 GSM7494298_AML4_DX_processed_matrix.mtx.gz
2023-06-16 01:31 129,642 GSM7494298_AML4_DX_processed_metadata.tsv.gz
2023-06-16 01:31 19,367,601 GSM7494298_AML4_DX_raw_barcodes.tsv.gz
2023-06-16 01:31 254,820 GSM7494298_AML4_DX_raw_genes.tsv.gz
2023-06-16 01:32 65,577,474 GSM7494298_AML4_DX_raw_matrix.mtx.gz
2023-06-16 01:32 25,022 GSM7494299_AML4_REL_processed_barcodes.tsv.gz
2023-06-16 01:32 83,469 GSM7494299_AML4_REL_processed_genes.tsv.gz
2023-06-16 01:32 50,700,292 GSM7494299_AML4_REL_processed_matrix.mtx.gz
2023-06-16 01:32 153,530 GSM7494299_AML4_REL_processed_metadata.tsv.gz
2023-06-16 01:32 19,367,602 GSM7494299_AML4_REL_raw_barcodes.tsv.gz
2023-06-16 01:32 254,821 GSM7494299_AML4_REL_raw_genes.tsv.gz
2023-06-16 01:32 73,283,528 GSM7494299_AML4_REL_raw_matrix.mtx.gz
2023-06-16 01:32 34,124 GSM7494300_AML22_DX_processed_barcodes.tsv.gz
2023-06-16 01:32 80,339 GSM7494300_AML22_DX_processed_genes.tsv.gz
2023-06-16 01:33 37,611,467 GSM7494300_AML22_DX_processed_matrix.mtx.gz
2023-06-16 01:33 210,383 GSM7494300_AML22_DX_processed_metadata.tsv.gz
2023-06-16 01:33 19,367,602 GSM7494300_AML22_DX_raw_barcodes.tsv.gz
2023-06-16 01:33 254,821 GSM7494300_AML22_DX_raw_genes.tsv.gz
2023-06-16 01:33 60,746,924 GSM7494300_AML22_DX_raw_matrix.mtx.gz
2023-06-16 01:33 17,189 GSM7494301_AML22_REL_processed_barcodes.tsv.gz
2023-06-16 01:33 80,340 GSM7494301_AML22_REL_processed_genes.tsv.gz
2023-06-16 01:33 16,384,657 GSM7494301_AML22_REL_processed_matrix.mtx.gz
2023-06-16 01:33 99,385 GSM7494301_AML22_REL_processed_metadata.tsv.gz
2023-06-16 01:33 19,367,603 GSM7494301_AML22_REL_raw_barcodes.tsv.gz
2023-06-16 01:33 254,822 GSM7494301_AML22_REL_raw_genes.tsv.gz
2023-06-16 01:33 36,847,384 GSM7494301_AML22_REL_raw_matrix.mtx.gz
2023-06-16 01:33 14,335 GSM7494302_AML22_REM_processed_barcodes.tsv.gz
2023-06-16 01:33 80,340 GSM7494302_AML22_REM_processed_genes.tsv.gz
2023-06-16 01:34 9,698,068 GSM7494302_AML22_REM_processed_matrix.mtx.gz
2023-06-16 01:34 81,471 GSM7494302_AML22_REM_processed_metadata.tsv.gz
2023-06-16 01:34 19,367,603 GSM7494302_AML22_REM_raw_barcodes.tsv.gz
2023-06-16 01:34 254,822 GSM7494302_AML22_REM_raw_genes.tsv.gz
2023-06-16 01:34 23,500,627 GSM7494302_AML22_REM_raw_matrix.mtx.gz
2023-06-16 01:34 18,349 GSM7494303_AML21_DX_processed_barcodes.tsv.gz
2023-06-16 01:34 81,681 GSM7494303_AML21_DX_processed_genes.tsv.gz
2023-06-16 01:34 21,737,585 GSM7494303_AML21_DX_processed_matrix.mtx.gz
2023-06-16 01:34 108,684 GSM7494303_AML21_DX_processed_metadata.tsv.gz
2023-06-16 01:34 19,367,602 GSM7494303_AML21_DX_raw_barcodes.tsv.gz
2023-06-16 01:34 254,821 GSM7494303_AML21_DX_raw_genes.tsv.gz
2023-06-16 01:34 36,941,956 GSM7494303_AML21_DX_raw_matrix.mtx.gz
2023-06-16 01:34 23,035 GSM7494304_AML21_REL_processed_barcodes.tsv.gz
2023-06-16 01:34 81,682 GSM7494304_AML21_REL_processed_genes.tsv.gz
2023-06-16 01:34 38,690,638 GSM7494304_AML21_REL_processed_matrix.mtx.gz
2023-06-16 01:35 139,497 GSM7494304_AML21_REL_processed_metadata.tsv.gz
2023-06-16 01:35 19,367,603 GSM7494304_AML21_REL_raw_barcodes.tsv.gz
2023-06-16 01:35 254,822 GSM7494304_AML21_REL_raw_genes.tsv.gz
2023-06-16 01:35 65,996,782 GSM7494304_AML21_REL_raw_matrix.mtx.gz
2023-06-16 01:35 12,158 GSM7494305_AML21_REM_processed_barcodes.tsv.gz
2023-06-16 01:35 81,682 GSM7494305_AML21_REM_processed_genes.tsv.gz
2023-06-16 01:35 10,315,955 GSM7494305_AML21_REM_processed_matrix.mtx.gz
2023-06-16 01:35 69,918 GSM7494305_AML21_REM_processed_metadata.tsv.gz
2023-06-16 01:35 19,367,603 GSM7494305_AML21_REM_raw_barcodes.tsv.gz
2023-06-16 01:35 254,822 GSM7494305_AML21_REM_raw_genes.tsv.gz
2023-06-16 01:35 21,874,263 GSM7494305_AML21_REM_raw_matrix.mtx.gz
2023-06-16 01:35 30,641 GSM7494306_AML24_DX_processed_barcodes.tsv.gz
2023-06-16 01:35 79,707 GSM7494306_AML24_DX_processed_genes.tsv.gz
2023-06-16 01:36 49,847,491 GSM7494306_AML24_DX_processed_matrix.mtx.gz
2023-06-16 01:36 189,649 GSM7494306_AML24_DX_processed_metadata.tsv.gz
2023-06-16 01:36 19,367,602 GSM7494306_AML24_DX_raw_barcodes.tsv.gz
2023-06-16 01:36 254,821 GSM7494306_AML24_DX_raw_genes.tsv.gz
2023-06-16 01:36 73,581,135 GSM7494306_AML24_DX_raw_matrix.mtx.gz
2023-06-16 01:36 6,104 GSM7494307_AML24_REL_processed_barcodes.tsv.gz
2023-06-16 01:36 79,708 GSM7494307_AML24_REL_processed_genes.tsv.gz
2023-06-16 01:36 4,367,879 GSM7494307_AML24_REL_processed_matrix.mtx.gz
2023-06-16 01:36 33,457 GSM7494307_AML24_REL_processed_metadata.tsv.gz
2023-06-16 01:36 19,367,603 GSM7494307_AML24_REL_raw_barcodes.tsv.gz
2023-06-16 01:36 254,822 GSM7494307_AML24_REL_raw_genes.tsv.gz
2023-06-16 01:36 12,124,445 GSM7494307_AML24_REL_raw_matrix.mtx.gz
2023-06-16 01:36 11,568 GSM7494308_AML24_REM_processed_barcodes.tsv.gz
2023-06-16 01:36 79,708 GSM7494308_AML24_REM_processed_genes.tsv.gz
2023-06-16 01:36 6,792,995 GSM7494308_AML24_REM_processed_matrix.mtx.gz
2023-06-16 01:36 64,376 GSM7494308_AML24_REM_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,603 GSM7494308_AML24_REM_raw_barcodes.tsv.gz
2023-06-16 01:37 254,822 GSM7494308_AML24_REM_raw_genes.tsv.gz
2023-06-16 01:37 19,158,615 GSM7494308_AML24_REM_raw_matrix.mtx.gz
2023-06-16 01:37 7,225 GSM7494309_AML23_DX_processed_barcodes.tsv.gz
2023-06-16 01:37 80,901 GSM7494309_AML23_DX_processed_genes.tsv.gz
2023-06-16 01:37 5,889,839 GSM7494309_AML23_DX_processed_matrix.mtx.gz
2023-06-16 01:37 39,458 GSM7494309_AML23_DX_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,602 GSM7494309_AML23_DX_raw_barcodes.tsv.gz
2023-06-16 01:37 254,821 GSM7494309_AML23_DX_raw_genes.tsv.gz
2023-06-16 01:37 16,014,558 GSM7494309_AML23_DX_raw_matrix.mtx.gz
2023-06-16 01:37 34,830 GSM7494310_AML23_REL_processed_barcodes.tsv.gz
2023-06-16 01:37 80,902 GSM7494310_AML23_REL_processed_genes.tsv.gz
2023-06-16 01:37 58,387,598 GSM7494310_AML23_REL_processed_matrix.mtx.gz
2023-06-16 01:37 217,147 GSM7494310_AML23_REL_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,603 GSM7494310_AML23_REL_raw_barcodes.tsv.gz
2023-06-16 01:37 254,822 GSM7494310_AML23_REL_raw_genes.tsv.gz
2023-06-16 01:38 83,333,215 GSM7494310_AML23_REL_raw_matrix.mtx.gz
2023-06-16 01:38 28,020 GSM7494311_AML23_REM_processed_barcodes.tsv.gz
2023-06-16 01:38 80,902 GSM7494311_AML23_REM_processed_genes.tsv.gz
2023-06-16 01:38 30,724,433 GSM7494311_AML23_REM_processed_matrix.mtx.gz
2023-06-16 01:38 170,279 GSM7494311_AML23_REM_processed_metadata.tsv.gz
2023-06-16 01:38 19,367,603 GSM7494311_AML23_REM_raw_barcodes.tsv.gz
2023-06-16 01:38 254,822 GSM7494311_AML23_REM_raw_genes.tsv.gz
2023-06-16 01:38 52,166,436 GSM7494311_AML23_REM_raw_matrix.mtx.gz
2023-06-16 01:38 29,063 GSM7494312_AML28_REL_processed_barcodes.tsv.gz
2023-06-16 01:38 79,976 GSM7494312_AML28_REL_processed_genes.tsv.gz
2023-06-16 01:39 39,487,825 GSM7494312_AML28_REL_processed_matrix.mtx.gz
2023-06-16 01:39 178,547 GSM7494312_AML28_REL_processed_metadata.tsv.gz
2023-06-16 01:39 19,367,603 GSM7494312_AML28_REL_raw_barcodes.tsv.gz
2023-06-16 01:39 254,822 GSM7494312_AML28_REL_raw_genes.tsv.gz
2023-06-16 01:39 55,665,343 GSM7494312_AML28_REL_raw_matrix.mtx.gz
2023-06-16 01:39 2,804 GSM7494313_AML28_REM_processed_barcodes.tsv.gz
2023-06-16 01:39 79,976 GSM7494313_AML28_REM_processed_genes.tsv.gz
2023-06-16 01:39 3,198,093 GSM7494313_AML28_REM_processed_matrix.mtx.gz
2023-06-16 01:39 14,850 GSM7494313_AML28_REM_processed_metadata.tsv.gz
2023-06-16 01:39 19,367,603 GSM7494313_AML28_REM_raw_barcodes.tsv.gz
2023-06-16 01:39 254,822 GSM7494313_AML28_REM_raw_genes.tsv.gz
2023-06-16 01:39 14,166,077 GSM7494313_AML28_REM_raw_matrix.mtx.gz
2023-06-16 01:39 31,471 GSM7494314_AML14_DX_processed_barcodes.tsv.gz
2023-06-16 01:39 83,070 GSM7494314_AML14_DX_processed_genes.tsv.gz
2023-06-16 01:40 61,316,963 GSM7494314_AML14_DX_processed_matrix.mtx.gz
2023-06-16 01:40 196,533 GSM7494314_AML14_DX_processed_metadata.tsv.gz
2023-06-16 01:40 19,367,602 GSM7494314_AML14_DX_raw_barcodes.tsv.gz
2023-06-16 01:40 254,821 GSM7494314_AML14_DX_raw_genes.tsv.gz
2023-06-16 01:40 89,692,805 GSM7494314_AML14_DX_raw_matrix.mtx.gz
2023-06-16 01:40 7,711 GSM7494315_AML14_REM_processed_barcodes.tsv.gz
2023-06-16 01:40 83,071 GSM7494315_AML14_REM_processed_genes.tsv.gz
2023-06-16 01:40 10,598,432 GSM7494315_AML14_REM_processed_matrix.mtx.gz
2023-06-16 01:40 43,674 GSM7494315_AML14_REM_processed_metadata.tsv.gz
2023-06-16 01:40 19,367,603 GSM7494315_AML14_REM_raw_barcodes.tsv.gz
2023-06-16 01:40 254,822 GSM7494315_AML14_REM_raw_genes.tsv.gz
2023-06-16 01:41 25,345,120 GSM7494315_AML14_REM_raw_matrix.mtx.gz
2023-06-16 01:41 27,580 GSM7494316_AML25_DX_processed_barcodes.tsv.gz
2023-06-16 01:41 84,863 GSM7494316_AML25_DX_processed_genes.tsv.gz
2023-06-16 01:41 40,644,598 GSM7494316_AML25_DX_processed_matrix.mtx.gz
2023-06-16 01:41 168,141 GSM7494316_AML25_DX_processed_metadata.tsv.gz
2023-06-16 01:41 19,367,602 GSM7494316_AML25_DX_raw_barcodes.tsv.gz
2023-06-16 01:41 254,821 GSM7494316_AML25_DX_raw_genes.tsv.gz
2023-06-16 01:41 63,812,052 GSM7494316_AML25_DX_raw_matrix.mtx.gz
2023-06-16 01:41 21,792 GSM7494317_AML25_REL_processed_barcodes.tsv.gz
2023-06-16 01:41 84,864 GSM7494317_AML25_REL_processed_genes.tsv.gz
2023-06-16 01:41 34,618,558 GSM7494317_AML25_REL_processed_matrix.mtx.gz
2023-06-16 01:41 131,755 GSM7494317_AML25_REL_processed_metadata.tsv.gz
2023-06-16 01:42 19,367,603 GSM7494317_AML25_REL_raw_barcodes.tsv.gz
2023-06-16 01:42 254,822 GSM7494317_AML25_REL_raw_genes.tsv.gz
2023-06-16 01:42 55,745,817 GSM7494317_AML25_REL_raw_matrix.mtx.gz
2023-06-16 01:42 18,308 GSM7494318_AML25_REM_processed_barcodes.tsv.gz
2023-06-16 01:42 84,864 GSM7494318_AML25_REM_processed_genes.tsv.gz
2023-06-16 01:42 20,882,373 GSM7494318_AML25_REM_processed_matrix.mtx.gz
2023-06-16 01:42 108,886 GSM7494318_AML25_REM_processed_metadata.tsv.gz
2023-06-16 01:42 19,367,603 GSM7494318_AML25_REM_raw_barcodes.tsv.gz
2023-06-16 01:42 254,822 GSM7494318_AML25_REM_raw_genes.tsv.gz
2023-06-16 01:42 36,642,968 GSM7494318_AML25_REM_raw_matrix.mtx.gz
2023-06-16 01:42 34,277 GSM7494319_AML26_DX_processed_barcodes.tsv.gz
2023-06-16 01:42 85,476 GSM7494319_AML26_DX_processed_genes.tsv.gz
2023-06-16 01:43 50,657,064 GSM7494319_AML26_DX_processed_matrix.mtx.gz
2023-06-16 01:43 212,772 GSM7494319_AML26_DX_processed_metadata.tsv.gz
2023-06-16 01:43 19,367,602 GSM7494319_AML26_DX_raw_barcodes.tsv.gz
2023-06-16 01:43 254,821 GSM7494319_AML26_DX_raw_genes.tsv.gz
2023-06-16 01:43 69,765,437 GSM7494319_AML26_DX_raw_matrix.mtx.gz
2023-06-16 01:43 31,673 GSM7494320_AML26_REL_processed_barcodes.tsv.gz
2023-06-16 01:43 85,477 GSM7494320_AML26_REL_processed_genes.tsv.gz
2023-06-16 01:43 53,222,843 GSM7494320_AML26_REL_processed_matrix.mtx.gz
2023-06-16 01:43 195,228 GSM7494320_AML26_REL_processed_metadata.tsv.gz
2023-06-16 01:44 19,367,603 GSM7494320_AML26_REL_raw_barcodes.tsv.gz
2023-06-16 01:44 254,822 GSM7494320_AML26_REL_raw_genes.tsv.gz
2023-06-16 01:44 70,110,195 GSM7494320_AML26_REL_raw_matrix.mtx.gz
2023-06-16 01:44 14,222 GSM7494321_AML26_REM_processed_barcodes.tsv.gz
2023-06-16 01:44 85,477 GSM7494321_AML26_REM_processed_genes.tsv.gz
2023-06-16 01:44 19,145,731 GSM7494321_AML26_REM_processed_matrix.mtx.gz
2023-06-16 01:44 83,541 GSM7494321_AML26_REM_processed_metadata.tsv.gz
2023-06-16 01:44 19,367,603 GSM7494321_AML26_REM_raw_barcodes.tsv.gz
2023-06-16 01:44 254,822 GSM7494321_AML26_REM_raw_genes.tsv.gz
2023-06-16 01:44 34,243,230 GSM7494321_AML26_REM_raw_matrix.mtx.gz
2023-06-16 01:44 23,266 GSM7494322_AML19_DX_processed_barcodes.tsv.gz
2023-06-16 01:44 79,634 GSM7494322_AML19_DX_processed_genes.tsv.gz
2023-06-16 01:45 43,123,523 GSM7494322_AML19_DX_processed_matrix.mtx.gz
2023-06-16 01:45 141,973 GSM7494322_AML19_DX_processed_metadata.tsv.gz
2023-06-16 01:45 19,367,602 GSM7494322_AML19_DX_raw_barcodes.tsv.gz
2023-06-16 01:45 254,821 GSM7494322_AML19_DX_raw_genes.tsv.gz
2023-06-16 01:45 58,552,443 GSM7494322_AML19_DX_raw_matrix.mtx.gz
2023-06-16 01:45 8,638 GSM7494323_AML19_REL_processed_barcodes.tsv.gz
2023-06-16 01:45 79,635 GSM7494323_AML19_REL_processed_genes.tsv.gz
2023-06-16 01:45 18,591,088 GSM7494323_AML19_REL_processed_matrix.mtx.gz
2023-06-16 01:45 49,598 GSM7494323_AML19_REL_processed_metadata.tsv.gz
2023-06-16 01:45 19,367,603 GSM7494323_AML19_REL_raw_barcodes.tsv.gz
2023-06-16 01:45 254,822 GSM7494323_AML19_REL_raw_genes.tsv.gz
2023-06-16 01:45 28,869,134 GSM7494323_AML19_REL_raw_matrix.mtx.gz
2023-06-16 01:45 32,079 GSM7494324_AML18_DX_processed_barcodes.tsv.gz
2023-06-16 01:45 80,632 GSM7494324_AML18_DX_processed_genes.tsv.gz
2023-06-16 01:46 64,292,794 GSM7494324_AML18_DX_processed_matrix.mtx.gz
2023-06-16 01:46 200,001 GSM7494324_AML18_DX_processed_metadata.tsv.gz
2023-06-16 01:46 19,367,602 GSM7494324_AML18_DX_raw_barcodes.tsv.gz
2023-06-16 01:46 254,821 GSM7494324_AML18_DX_raw_genes.tsv.gz
2023-06-16 01:46 89,327,411 GSM7494324_AML18_DX_raw_matrix.mtx.gz
2023-06-16 01:46 13,415 GSM7494325_AML18_REL_processed_barcodes.tsv.gz
2023-06-16 01:46 80,633 GSM7494325_AML18_REL_processed_genes.tsv.gz
2023-06-16 01:46 15,992,541 GSM7494325_AML18_REL_processed_matrix.mtx.gz
2023-06-16 01:46 76,862 GSM7494325_AML18_REL_processed_metadata.tsv.gz
2023-06-16 01:47 19,367,603 GSM7494325_AML18_REL_raw_barcodes.tsv.gz
2023-06-16 01:47 254,822 GSM7494325_AML18_REL_raw_genes.tsv.gz
2023-06-16 01:47 43,200,516 GSM7494325_AML18_REL_raw_matrix.mtx.gz
2023-06-16 01:47 33,194 GSM7494326_AML12_DX_processed_barcodes.tsv.gz
2023-06-16 01:47 84,192 GSM7494326_AML12_DX_processed_genes.tsv.gz
2023-06-16 01:47 66,450,048 GSM7494326_AML12_DX_processed_matrix.mtx.gz
2023-06-16 01:47 207,398 GSM7494326_AML12_DX_processed_metadata.tsv.gz
2023-06-16 01:47 19,367,602 GSM7494326_AML12_DX_raw_barcodes.tsv.gz
2023-06-16 01:47 254,821 GSM7494326_AML12_DX_raw_genes.tsv.gz
2023-06-16 01:48 98,958,970 GSM7494326_AML12_DX_raw_matrix.mtx.gz
2023-06-16 01:48 21,282 GSM7494327_AML12_REL_processed_barcodes.tsv.gz
2023-06-16 01:48 84,193 GSM7494327_AML12_REL_processed_genes.tsv.gz
2023-06-16 01:48 38,509,535 GSM7494327_AML12_REL_processed_matrix.mtx.gz
2023-06-16 01:48 128,029 GSM7494327_AML12_REL_processed_metadata.tsv.gz
2023-06-16 01:48 19,367,603 GSM7494327_AML12_REL_raw_barcodes.tsv.gz
2023-06-16 01:48 254,822 GSM7494327_AML12_REL_raw_genes.tsv.gz
2023-06-16 01:48 72,609,549 GSM7494327_AML12_REL_raw_matrix.mtx.gz
2023-06-16 01:48 15,382 GSM7494328_AML12_REM_processed_barcodes.tsv.gz
2023-06-16 01:48 84,193 GSM7494328_AML12_REM_processed_genes.tsv.gz
2023-06-16 01:49 20,389,328 GSM7494328_AML12_REM_processed_matrix.mtx.gz
2023-06-16 01:49 90,294 GSM7494328_AML12_REM_processed_metadata.tsv.gz
2023-06-16 01:49 19,367,603 GSM7494328_AML12_REM_raw_barcodes.tsv.gz
2023-06-16 01:49 254,822 GSM7494328_AML12_REM_raw_genes.tsv.gz
2023-06-16 01:49 40,579,509 GSM7494328_AML12_REM_raw_matrix.mtx.gz
2023-06-16 01:49 20,038 GSM7494329_AML13_DX_processed_barcodes.tsv.gz
2023-06-16 01:49 84,312 GSM7494329_AML13_DX_processed_genes.tsv.gz
2023-06-16 01:49 23,394,461 GSM7494329_AML13_DX_processed_matrix.mtx.gz
2023-06-16 01:49 119,791 GSM7494329_AML13_DX_processed_metadata.tsv.gz
2023-06-16 01:49 19,367,602 GSM7494329_AML13_DX_raw_barcodes.tsv.gz
2023-06-16 01:49 254,821 GSM7494329_AML13_DX_raw_genes.tsv.gz
2023-06-16 01:50 51,422,961 GSM7494329_AML13_DX_raw_matrix.mtx.gz
2023-06-16 01:50 29,948 GSM7494330_AML13_REL_processed_barcodes.tsv.gz
2023-06-16 01:50 84,313 GSM7494330_AML13_REL_processed_genes.tsv.gz
2023-06-16 01:50 46,599,252 GSM7494330_AML13_REL_processed_matrix.mtx.gz
2023-06-16 01:50 184,833 GSM7494330_AML13_REL_processed_metadata.tsv.gz
2023-06-16 01:50 19,367,603 GSM7494330_AML13_REL_raw_barcodes.tsv.gz
2023-06-16 01:50 254,822 GSM7494330_AML13_REL_raw_genes.tsv.gz
2023-06-16 01:50 70,788,361 GSM7494330_AML13_REL_raw_matrix.mtx.gz
2023-06-16 01:50 28,887 GSM7494331_AML13_REM_processed_barcodes.tsv.gz
2023-06-16 01:50 84,313 GSM7494331_AML13_REM_processed_genes.tsv.gz
2023-06-16 01:50 24,651,210 GSM7494331_AML13_REM_processed_matrix.mtx.gz
2023-06-16 01:50 177,066 GSM7494331_AML13_REM_processed_metadata.tsv.gz
2023-06-16 01:51 19,367,603 GSM7494331_AML13_REM_raw_barcodes.tsv.gz
2023-06-16 01:51 254,822 GSM7494331_AML13_REM_raw_genes.tsv.gz
2023-06-16 01:51 64,901,299 GSM7494331_AML13_REM_raw_matrix.mtx.gz
526 个文件 15,087,903,462 字节
2 个目录 204,743,172,096 可用字节
# Make the extracted 'data' folder the working directory, then report where we are.
os.chdir('data')
current_dir = os.getcwd()
print("当前工作目录:", current_dir)
当前工作目录: C:\Users\Administrator\Desktop\test\data
def rename_genes_to_features(directory=".", old="genes", new="features"):
    """Rename every entry in *directory* whose name contains *old*.

    Each occurrence of *old* in the entry name is replaced by *new*
    (e.g. ``..._raw_genes.tsv.gz`` -> ``..._raw_features.tsv.gz``).
    This is done with the standard library only, so it works the same on
    Windows, macOS and Linux and does not depend on PowerShell or on
    shell quoting rules.

    Args:
        directory: Folder whose entries are renamed (default: current
            working directory).
        old: Substring to look for in entry names (case-sensitive).
        new: Replacement for *old*.

    Returns:
        The list of new names, in sorted order of the original names.
        Entries whose target name already exists are left untouched and
        are not included.

    Raises:
        ValueError: If *old* is empty.
    """
    if not old:
        raise ValueError("old must be a non-empty string")

    renamed = []
    # Sort so the processing order (and the returned list) is deterministic.
    for name in sorted(os.listdir(directory)):
        if old not in name:
            continue
        target = name.replace(old, new)
        destination = os.path.join(directory, target)
        # Never clobber an existing file; this also makes re-running the
        # cell after a partial rename safe.
        if os.path.exists(destination):
            continue
        os.rename(os.path.join(directory, name), destination)
        renamed.append(target)
    return renamed


# Batch-rename the '*genes*' files in the current directory to '*features*'.
renamed_files = rename_genes_to_features()
0
# List the directory again to check that the 'genes' files now appear as 'features'.
ls
驱动器 C 中的卷没有标签。
卷的序列号是 10D4-7984
C:\Users\Administrator\Desktop\test\data 的目录
2024-06-18 22:13 <DIR> .
2024-06-18 22:13 <DIR> ..
2024-06-18 17:10 7,544,156,160 GSE235063_RAW.tar
2023-06-16 01:08 29,751 GSM7494257_AML16_DX_processed_barcodes.tsv.gz
2023-06-16 01:08 87,866 GSM7494257_AML16_DX_processed_features.tsv.gz
2023-06-16 01:08 49,880,893 GSM7494257_AML16_DX_processed_matrix.mtx.gz
2023-06-16 01:08 183,537 GSM7494257_AML16_DX_processed_metadata.tsv.gz
2023-06-16 01:08 19,367,602 GSM7494257_AML16_DX_raw_barcodes.tsv.gz
2023-06-16 01:08 254,821 GSM7494257_AML16_DX_raw_features.tsv.gz
2023-06-16 01:09 86,702,202 GSM7494257_AML16_DX_raw_matrix.mtx.gz
2023-06-16 01:09 20,398 GSM7494258_AML16_REL_processed_barcodes.tsv.gz
2023-06-16 01:09 87,867 GSM7494258_AML16_REL_processed_features.tsv.gz
2023-06-16 01:09 30,464,777 GSM7494258_AML16_REL_processed_matrix.mtx.gz
2023-06-16 01:09 121,775 GSM7494258_AML16_REL_processed_metadata.tsv.gz
2023-06-16 01:09 19,367,603 GSM7494258_AML16_REL_raw_barcodes.tsv.gz
2023-06-16 01:09 254,822 GSM7494258_AML16_REL_raw_features.tsv.gz
2023-06-16 01:10 62,076,582 GSM7494258_AML16_REL_raw_matrix.mtx.gz
2023-06-16 01:10 23,047 GSM7494259_AML16_REM_processed_barcodes.tsv.gz
2023-06-16 01:10 87,867 GSM7494259_AML16_REM_processed_features.tsv.gz
2023-06-16 01:10 33,922,410 GSM7494259_AML16_REM_processed_matrix.mtx.gz
2023-06-16 01:10 138,763 GSM7494259_AML16_REM_processed_metadata.tsv.gz
2023-06-16 01:10 19,367,603 GSM7494259_AML16_REM_raw_barcodes.tsv.gz
2023-06-16 01:10 254,822 GSM7494259_AML16_REM_raw_features.tsv.gz
2023-06-16 01:10 56,851,099 GSM7494259_AML16_REM_raw_matrix.mtx.gz
2023-06-16 01:10 28,064 GSM7494260_AML6_DX_processed_barcodes.tsv.gz
2023-06-16 01:10 83,951 GSM7494260_AML6_DX_processed_features.tsv.gz
2023-06-16 01:10 43,102,857 GSM7494260_AML6_DX_processed_matrix.mtx.gz
2023-06-16 01:10 170,376 GSM7494260_AML6_DX_processed_metadata.tsv.gz
2023-06-16 01:11 19,367,601 GSM7494260_AML6_DX_raw_barcodes.tsv.gz
2023-06-16 01:11 254,820 GSM7494260_AML6_DX_raw_features.tsv.gz
2023-06-16 01:11 79,014,793 GSM7494260_AML6_DX_raw_matrix.mtx.gz
2023-06-16 01:11 23,921 GSM7494261_AML6_REL_processed_barcodes.tsv.gz
2023-06-16 01:11 83,952 GSM7494261_AML6_REL_processed_features.tsv.gz
2023-06-16 01:11 38,350,741 GSM7494261_AML6_REL_processed_matrix.mtx.gz
2023-06-16 01:11 143,156 GSM7494261_AML6_REL_processed_metadata.tsv.gz
2023-06-16 01:11 19,367,602 GSM7494261_AML6_REL_raw_barcodes.tsv.gz
2023-06-16 01:11 254,821 GSM7494261_AML6_REL_raw_features.tsv.gz
2023-06-16 01:12 65,441,183 GSM7494261_AML6_REL_raw_matrix.mtx.gz
2023-06-16 01:12 16,648 GSM7494262_AML6_REM_processed_barcodes.tsv.gz
2023-06-16 01:12 83,952 GSM7494262_AML6_REM_processed_features.tsv.gz
2023-06-16 01:12 19,590,433 GSM7494262_AML6_REM_processed_matrix.mtx.gz
2023-06-16 01:12 98,210 GSM7494262_AML6_REM_processed_metadata.tsv.gz
2023-06-16 01:12 19,367,602 GSM7494262_AML6_REM_raw_barcodes.tsv.gz
2023-06-16 01:12 254,821 GSM7494262_AML6_REM_raw_features.tsv.gz
2023-06-16 01:12 41,173,295 GSM7494262_AML6_REM_raw_matrix.mtx.gz
2023-06-16 01:12 22,227 GSM7494263_AML2_DX_processed_barcodes.tsv.gz
2023-06-16 01:12 85,538 GSM7494263_AML2_DX_processed_features.tsv.gz
2023-06-16 01:12 31,590,399 GSM7494263_AML2_DX_processed_matrix.mtx.gz
2023-06-16 01:12 133,746 GSM7494263_AML2_DX_processed_metadata.tsv.gz
2023-06-16 01:12 19,367,601 GSM7494263_AML2_DX_raw_barcodes.tsv.gz
2023-06-16 01:12 254,820 GSM7494263_AML2_DX_raw_features.tsv.gz
2023-06-16 01:13 49,031,697 GSM7494263_AML2_DX_raw_matrix.mtx.gz
2023-06-16 01:13 24,812 GSM7494264_AML2_REL_processed_barcodes.tsv.gz
2023-06-16 01:13 85,539 GSM7494264_AML2_REL_processed_features.tsv.gz
2023-06-16 01:13 38,593,511 GSM7494264_AML2_REL_processed_matrix.mtx.gz
2023-06-16 01:13 150,329 GSM7494264_AML2_REL_processed_metadata.tsv.gz
2023-06-16 01:13 19,367,602 GSM7494264_AML2_REL_raw_barcodes.tsv.gz
2023-06-16 01:13 254,821 GSM7494264_AML2_REL_raw_features.tsv.gz
2023-06-16 01:13 58,768,408 GSM7494264_AML2_REL_raw_matrix.mtx.gz
2023-06-16 01:13 9,096 GSM7494265_AML2_REM_processed_barcodes.tsv.gz
2023-06-16 01:13 85,539 GSM7494265_AML2_REM_processed_features.tsv.gz
2023-06-16 01:13 13,625,454 GSM7494265_AML2_REM_processed_matrix.mtx.gz
2023-06-16 01:13 52,148 GSM7494265_AML2_REM_processed_metadata.tsv.gz
2023-06-16 01:14 19,367,602 GSM7494265_AML2_REM_raw_barcodes.tsv.gz
2023-06-16 01:14 254,821 GSM7494265_AML2_REM_raw_features.tsv.gz
2023-06-16 01:14 27,780,603 GSM7494265_AML2_REM_raw_matrix.mtx.gz
2023-06-16 01:14 26,757 GSM7494266_AML15_DX_processed_barcodes.tsv.gz
2023-06-16 01:14 83,451 GSM7494266_AML15_DX_processed_features.tsv.gz
2023-06-16 01:14 35,485,678 GSM7494266_AML15_DX_processed_matrix.mtx.gz
2023-06-16 01:14 164,393 GSM7494266_AML15_DX_processed_metadata.tsv.gz
2023-06-16 01:14 19,367,602 GSM7494266_AML15_DX_raw_barcodes.tsv.gz
2023-06-16 01:14 254,821 GSM7494266_AML15_DX_raw_features.tsv.gz
2023-06-16 01:14 56,631,932 GSM7494266_AML15_DX_raw_matrix.mtx.gz
2023-06-16 01:14 23,621 GSM7494267_AML15_REL_processed_barcodes.tsv.gz
2023-06-16 01:14 83,452 GSM7494267_AML15_REL_processed_features.tsv.gz
2023-06-16 01:14 36,754,915 GSM7494267_AML15_REL_processed_matrix.mtx.gz
2023-06-16 01:14 144,466 GSM7494267_AML15_REL_processed_metadata.tsv.gz
2023-06-16 01:15 19,367,603 GSM7494267_AML15_REL_raw_barcodes.tsv.gz
2023-06-16 01:15 254,822 GSM7494267_AML15_REL_raw_features.tsv.gz
2023-06-16 01:15 57,113,401 GSM7494267_AML15_REL_raw_matrix.mtx.gz
2023-06-16 01:15 8,980 GSM7494268_AML15_REM_processed_barcodes.tsv.gz
2023-06-16 01:15 83,452 GSM7494268_AML15_REM_processed_features.tsv.gz
2023-06-16 01:15 11,162,022 GSM7494268_AML15_REM_processed_matrix.mtx.gz
2023-06-16 01:15 51,054 GSM7494268_AML15_REM_processed_metadata.tsv.gz
2023-06-16 01:15 19,367,603 GSM7494268_AML15_REM_raw_barcodes.tsv.gz
2023-06-16 01:15 254,822 GSM7494268_AML15_REM_raw_features.tsv.gz
2023-06-16 01:15 22,627,500 GSM7494268_AML15_REM_raw_matrix.mtx.gz
2023-06-16 01:15 36,658 GSM7494269_AML3_DX_processed_barcodes.tsv.gz
2023-06-16 01:15 83,598 GSM7494269_AML3_DX_processed_features.tsv.gz
2023-06-16 01:16 63,540,161 GSM7494269_AML3_DX_processed_matrix.mtx.gz
2023-06-16 01:16 229,153 GSM7494269_AML3_DX_processed_metadata.tsv.gz
2023-06-16 01:16 19,367,601 GSM7494269_AML3_DX_raw_barcodes.tsv.gz
2023-06-16 01:16 254,820 GSM7494269_AML3_DX_raw_features.tsv.gz
2023-06-16 01:16 84,995,686 GSM7494269_AML3_DX_raw_matrix.mtx.gz
2023-06-16 01:16 12,787 GSM7494270_AML3_REM_processed_barcodes.tsv.gz
2023-06-16 01:16 83,599 GSM7494270_AML3_REM_processed_features.tsv.gz
2023-06-16 01:16 14,219,196 GSM7494270_AML3_REM_processed_matrix.mtx.gz
2023-06-16 01:16 74,567 GSM7494270_AML3_REM_processed_metadata.tsv.gz
2023-06-16 01:16 19,367,602 GSM7494270_AML3_REM_raw_barcodes.tsv.gz
2023-06-16 01:16 254,821 GSM7494270_AML3_REM_raw_features.tsv.gz
2023-06-16 01:16 26,216,012 GSM7494270_AML3_REM_raw_matrix.mtx.gz
2023-06-16 01:16 317 GSM7494271_AML7_DX_processed_barcodes.tsv.gz
2023-06-16 01:16 76,073 GSM7494271_AML7_DX_processed_features.tsv.gz
2023-06-16 01:16 159,133 GSM7494271_AML7_DX_processed_matrix.mtx.gz
2023-06-16 01:16 1,596 GSM7494271_AML7_DX_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,601 GSM7494271_AML7_DX_raw_barcodes.tsv.gz
2023-06-16 01:17 254,820 GSM7494271_AML7_DX_raw_features.tsv.gz
2023-06-16 01:17 9,022,172 GSM7494271_AML7_DX_raw_matrix.mtx.gz
2023-06-16 01:17 11,530 GSM7494272_AML7_REL_processed_barcodes.tsv.gz
2023-06-16 01:17 76,074 GSM7494272_AML7_REL_processed_features.tsv.gz
2023-06-16 01:17 9,654,849 GSM7494272_AML7_REL_processed_matrix.mtx.gz
2023-06-16 01:17 65,123 GSM7494272_AML7_REL_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,602 GSM7494272_AML7_REL_raw_barcodes.tsv.gz
2023-06-16 01:17 254,821 GSM7494272_AML7_REL_raw_features.tsv.gz
2023-06-16 01:17 20,279,967 GSM7494272_AML7_REL_raw_matrix.mtx.gz
2023-06-16 01:17 10,759 GSM7494273_AML7_REM_processed_barcodes.tsv.gz
2023-06-16 01:17 76,074 GSM7494273_AML7_REM_processed_features.tsv.gz
2023-06-16 01:17 10,413,802 GSM7494273_AML7_REM_processed_matrix.mtx.gz
2023-06-16 01:17 61,428 GSM7494273_AML7_REM_processed_metadata.tsv.gz
2023-06-16 01:17 19,367,602 GSM7494273_AML7_REM_raw_barcodes.tsv.gz
2023-06-16 01:17 254,821 GSM7494273_AML7_REM_raw_features.tsv.gz
2023-06-16 01:17 19,064,329 GSM7494273_AML7_REM_raw_matrix.mtx.gz
2023-06-16 01:17 20,032 GSM7494274_AML8_DX_processed_barcodes.tsv.gz
2023-06-16 01:17 80,089 GSM7494274_AML8_DX_processed_features.tsv.gz
2023-06-16 01:18 21,122,226 GSM7494274_AML8_DX_processed_matrix.mtx.gz
2023-06-16 01:18 118,859 GSM7494274_AML8_DX_processed_metadata.tsv.gz
2023-06-16 01:18 19,367,601 GSM7494274_AML8_DX_raw_barcodes.tsv.gz
2023-06-16 01:18 254,820 GSM7494274_AML8_DX_raw_features.tsv.gz
2023-06-16 01:18 50,099,759 GSM7494274_AML8_DX_raw_matrix.mtx.gz
2023-06-16 01:18 23,199 GSM7494275_AML8_REL_processed_barcodes.tsv.gz
2023-06-16 01:18 80,090 GSM7494275_AML8_REL_processed_features.tsv.gz
2023-06-16 01:18 28,644,068 GSM7494275_AML8_REL_processed_matrix.mtx.gz
2023-06-16 01:18 139,219 GSM7494275_AML8_REL_processed_metadata.tsv.gz
2023-06-16 01:18 19,367,602 GSM7494275_AML8_REL_raw_barcodes.tsv.gz
2023-06-16 01:18 254,821 GSM7494275_AML8_REL_raw_features.tsv.gz
2023-06-16 01:18 44,520,290 GSM7494275_AML8_REL_raw_matrix.mtx.gz
2023-06-16 01:18 14,314 GSM7494276_AML8_REM_processed_barcodes.tsv.gz
2023-06-16 01:18 80,090 GSM7494276_AML8_REM_processed_features.tsv.gz
2023-06-16 01:19 14,431,401 GSM7494276_AML8_REM_processed_matrix.mtx.gz
2023-06-16 01:19 82,740 GSM7494276_AML8_REM_processed_metadata.tsv.gz
2023-06-16 01:19 19,367,602 GSM7494276_AML8_REM_raw_barcodes.tsv.gz
2023-06-16 01:19 254,821 GSM7494276_AML8_REM_raw_features.tsv.gz
2023-06-16 01:19 25,272,238 GSM7494276_AML8_REM_raw_matrix.mtx.gz
2023-06-16 01:19 28,065 GSM7494277_AML20_DX_processed_barcodes.tsv.gz
2023-06-16 01:19 79,460 GSM7494277_AML20_DX_processed_features.tsv.gz
2023-06-16 01:19 45,826,895 GSM7494277_AML20_DX_processed_matrix.mtx.gz
2023-06-16 01:19 172,104 GSM7494277_AML20_DX_processed_metadata.tsv.gz
2023-06-16 01:19 19,367,602 GSM7494277_AML20_DX_raw_barcodes.tsv.gz
2023-06-16 01:19 254,821 GSM7494277_AML20_DX_raw_features.tsv.gz
2023-06-16 01:20 60,089,573 GSM7494277_AML20_DX_raw_matrix.mtx.gz
2023-06-16 01:20 9,787 GSM7494278_AML20_REM_processed_barcodes.tsv.gz
2023-06-16 01:20 79,461 GSM7494278_AML20_REM_processed_features.tsv.gz
2023-06-16 01:20 10,555,065 GSM7494278_AML20_REM_processed_matrix.mtx.gz
2023-06-16 01:20 55,656 GSM7494278_AML20_REM_processed_metadata.tsv.gz
2023-06-16 01:20 19,367,603 GSM7494278_AML20_REM_raw_barcodes.tsv.gz
2023-06-16 01:20 254,822 GSM7494278_AML20_REM_raw_features.tsv.gz
2023-06-16 01:20 26,739,671 GSM7494278_AML20_REM_raw_matrix.mtx.gz
2023-06-16 01:20 23,563 GSM7494279_AML5_DX_processed_barcodes.tsv.gz
2023-06-16 01:20 83,571 GSM7494279_AML5_DX_processed_features.tsv.gz
2023-06-16 01:20 36,819,972 GSM7494279_AML5_DX_processed_matrix.mtx.gz
2023-06-16 01:20 141,912 GSM7494279_AML5_DX_processed_metadata.tsv.gz
2023-06-16 01:20 19,367,601 GSM7494279_AML5_DX_raw_barcodes.tsv.gz
2023-06-16 01:20 254,820 GSM7494279_AML5_DX_raw_features.tsv.gz
2023-06-16 01:21 58,892,249 GSM7494279_AML5_DX_raw_matrix.mtx.gz
2023-06-16 01:21 19,306 GSM7494280_AML5_REL_processed_barcodes.tsv.gz
2023-06-16 01:21 83,572 GSM7494280_AML5_REL_processed_features.tsv.gz
2023-06-16 01:21 28,094,035 GSM7494280_AML5_REL_processed_matrix.mtx.gz
2023-06-16 01:21 113,413 GSM7494280_AML5_REL_processed_metadata.tsv.gz
2023-06-16 01:21 19,367,602 GSM7494280_AML5_REL_raw_barcodes.tsv.gz
2023-06-16 01:21 254,821 GSM7494280_AML5_REL_raw_features.tsv.gz
2023-06-16 01:21 48,609,297 GSM7494280_AML5_REL_raw_matrix.mtx.gz
2023-06-16 01:21 18,495 GSM7494281_AML5_REM_processed_barcodes.tsv.gz
2023-06-16 01:21 83,572 GSM7494281_AML5_REM_processed_features.tsv.gz
2023-06-16 01:21 19,938,386 GSM7494281_AML5_REM_processed_matrix.mtx.gz
2023-06-16 01:21 109,053 GSM7494281_AML5_REM_processed_metadata.tsv.gz
2023-06-16 01:21 19,367,602 GSM7494281_AML5_REM_raw_barcodes.tsv.gz
2023-06-16 01:21 254,821 GSM7494281_AML5_REM_raw_features.tsv.gz
2023-06-16 01:22 33,901,580 GSM7494281_AML5_REM_raw_matrix.mtx.gz
2023-06-16 01:22 30,249 GSM7494282_AML17_DX_processed_barcodes.tsv.gz
2023-06-16 01:22 79,270 GSM7494282_AML17_DX_processed_features.tsv.gz
2023-06-16 01:22 42,822,120 GSM7494282_AML17_DX_processed_matrix.mtx.gz
2023-06-16 01:22 185,029 GSM7494282_AML17_DX_processed_metadata.tsv.gz
2023-06-16 01:22 19,367,602 GSM7494282_AML17_DX_raw_barcodes.tsv.gz
2023-06-16 01:22 254,821 GSM7494282_AML17_DX_raw_features.tsv.gz
2023-06-16 01:22 61,341,998 GSM7494282_AML17_DX_raw_matrix.mtx.gz
2023-06-16 01:22 39,213 GSM7494283_AML17_REL_processed_barcodes.tsv.gz
2023-06-16 01:22 79,271 GSM7494283_AML17_REL_processed_features.tsv.gz
2023-06-16 01:22 37,678,921 GSM7494283_AML17_REL_processed_matrix.mtx.gz
2023-06-16 01:22 241,897 GSM7494283_AML17_REL_processed_metadata.tsv.gz
2023-06-16 01:23 19,367,603 GSM7494283_AML17_REL_raw_barcodes.tsv.gz
2023-06-16 01:23 254,822 GSM7494283_AML17_REL_raw_features.tsv.gz
2023-06-16 01:23 61,044,197 GSM7494283_AML17_REL_raw_matrix.mtx.gz
2023-06-16 01:23 17,187 GSM7494284_AML1_REM_processed_barcodes.tsv.gz
2023-06-16 01:23 84,014 GSM7494284_AML1_REM_processed_features.tsv.gz
2023-06-16 01:23 28,484,540 GSM7494284_AML1_REM_processed_matrix.mtx.gz
2023-06-16 01:23 102,693 GSM7494284_AML1_REM_processed_metadata.tsv.gz
2023-06-16 01:23 19,367,602 GSM7494284_AML1_REM_raw_barcodes.tsv.gz
2023-06-16 01:23 254,821 GSM7494284_AML1_REM_raw_features.tsv.gz
2023-06-16 01:24 59,182,014 GSM7494284_AML1_REM_raw_matrix.mtx.gz
2023-06-16 01:24 12,319 GSM7494285_AML1_DX_processed_barcodes.tsv.gz
2023-06-16 01:24 84,013 GSM7494285_AML1_DX_processed_features.tsv.gz
2023-06-16 01:24 19,023,140 GSM7494285_AML1_DX_processed_matrix.mtx.gz
2023-06-16 01:24 72,158 GSM7494285_AML1_DX_processed_metadata.tsv.gz
2023-06-16 01:24 19,367,601 GSM7494285_AML1_DX_raw_barcodes.tsv.gz
2023-06-16 01:24 254,820 GSM7494285_AML1_DX_raw_features.tsv.gz
2023-06-16 01:24 37,294,389 GSM7494285_AML1_DX_raw_matrix.mtx.gz
2023-06-16 01:24 18,212 GSM7494286_AML27_DX_processed_barcodes.tsv.gz
2023-06-16 01:24 84,496 GSM7494286_AML27_DX_processed_features.tsv.gz
2023-06-16 01:24 29,578,430 GSM7494286_AML27_DX_processed_matrix.mtx.gz
2023-06-16 01:24 108,535 GSM7494286_AML27_DX_processed_metadata.tsv.gz
2023-06-16 01:24 19,367,602 GSM7494286_AML27_DX_raw_barcodes.tsv.gz
2023-06-16 01:24 254,821 GSM7494286_AML27_DX_raw_features.tsv.gz
2023-06-16 01:25 47,395,430 GSM7494286_AML27_DX_raw_matrix.mtx.gz
2023-06-16 01:25 20,200 GSM7494287_AML27_REL_processed_barcodes.tsv.gz
2023-06-16 01:25 84,497 GSM7494287_AML27_REL_processed_features.tsv.gz
2023-06-16 01:25 32,430,945 GSM7494287_AML27_REL_processed_matrix.mtx.gz
2023-06-16 01:25 122,288 GSM7494287_AML27_REL_processed_metadata.tsv.gz
2023-06-16 01:25 19,367,603 GSM7494287_AML27_REL_raw_barcodes.tsv.gz
2023-06-16 01:25 254,822 GSM7494287_AML27_REL_raw_features.tsv.gz
2023-06-16 01:25 53,159,637 GSM7494287_AML27_REL_raw_matrix.mtx.gz
2023-06-16 01:25 29,374 GSM7494288_AML27_REM_processed_barcodes.tsv.gz
2023-06-16 01:25 84,497 GSM7494288_AML27_REM_processed_features.tsv.gz
2023-06-16 01:25 36,260,650 GSM7494288_AML27_REM_processed_matrix.mtx.gz
2023-06-16 01:25 178,981 GSM7494288_AML27_REM_processed_metadata.tsv.gz
2023-06-16 01:26 19,367,603 GSM7494288_AML27_REM_raw_barcodes.tsv.gz
2023-06-16 01:26 254,822 GSM7494288_AML27_REM_raw_features.tsv.gz
2023-06-16 01:26 54,699,393 GSM7494288_AML27_REM_raw_matrix.mtx.gz
2023-06-16 01:26 34,023 GSM7494289_AML9_DX_processed_barcodes.tsv.gz
2023-06-16 01:26 81,973 GSM7494289_AML9_DX_processed_features.tsv.gz
2023-06-16 01:26 61,149,166 GSM7494289_AML9_DX_processed_matrix.mtx.gz
2023-06-16 01:26 209,615 GSM7494289_AML9_DX_processed_metadata.tsv.gz
2023-06-16 01:26 19,367,601 GSM7494289_AML9_DX_raw_barcodes.tsv.gz
2023-06-16 01:26 254,820 GSM7494289_AML9_DX_raw_features.tsv.gz
2023-06-16 01:27 88,960,064 GSM7494289_AML9_DX_raw_matrix.mtx.gz
2023-06-16 01:27 20,103 GSM7494290_AML9_REL_processed_barcodes.tsv.gz
2023-06-16 01:27 81,974 GSM7494290_AML9_REL_processed_features.tsv.gz
2023-06-16 01:27 18,011,144 GSM7494290_AML9_REL_processed_matrix.mtx.gz
2023-06-16 01:27 117,266 GSM7494290_AML9_REL_processed_metadata.tsv.gz
2023-06-16 01:27 19,367,602 GSM7494290_AML9_REL_raw_barcodes.tsv.gz
2023-06-16 01:27 254,821 GSM7494290_AML9_REL_raw_features.tsv.gz
2023-06-16 01:27 34,459,847 GSM7494290_AML9_REL_raw_matrix.mtx.gz
2023-06-16 01:27 13,146 GSM7494291_AML9_REM_processed_barcodes.tsv.gz
2023-06-16 01:27 81,974 GSM7494291_AML9_REM_processed_features.tsv.gz
2023-06-16 01:27 14,660,074 GSM7494291_AML9_REM_processed_matrix.mtx.gz
2023-06-16 01:27 75,343 GSM7494291_AML9_REM_processed_metadata.tsv.gz
2023-06-16 01:27 19,367,602 GSM7494291_AML9_REM_raw_barcodes.tsv.gz
2023-06-16 01:27 254,821 GSM7494291_AML9_REM_raw_features.tsv.gz
2023-06-16 01:28 30,331,894 GSM7494291_AML9_REM_raw_matrix.mtx.gz
2023-06-16 01:28 23,701 GSM7494292_AML10_DX_processed_barcodes.tsv.gz
2023-06-16 01:28 85,288 GSM7494292_AML10_DX_processed_features.tsv.gz
2023-06-16 01:28 39,537,523 GSM7494292_AML10_DX_processed_matrix.mtx.gz
2023-06-16 01:28 143,906 GSM7494292_AML10_DX_processed_metadata.tsv.gz
2023-06-16 01:28 19,367,602 GSM7494292_AML10_DX_raw_barcodes.tsv.gz
2023-06-16 01:28 254,821 GSM7494292_AML10_DX_raw_features.tsv.gz
2023-06-16 01:28 60,180,315 GSM7494292_AML10_DX_raw_matrix.mtx.gz
2023-06-16 01:28 12,387 GSM7494293_AML10_REL_processed_barcodes.tsv.gz
2023-06-16 01:28 85,289 GSM7494293_AML10_REL_processed_features.tsv.gz
2023-06-16 01:28 16,136,041 GSM7494293_AML10_REL_processed_matrix.mtx.gz
2023-06-16 01:28 70,838 GSM7494293_AML10_REL_processed_metadata.tsv.gz
2023-06-16 01:28 19,367,603 GSM7494293_AML10_REL_raw_barcodes.tsv.gz
2023-06-16 01:28 254,822 GSM7494293_AML10_REL_raw_features.tsv.gz
2023-06-16 01:29 47,428,946 GSM7494293_AML10_REL_raw_matrix.mtx.gz
2023-06-16 01:29 12,852 GSM7494294_AML10_REM_processed_barcodes.tsv.gz
2023-06-16 01:29 85,289 GSM7494294_AML10_REM_processed_features.tsv.gz
2023-06-16 01:29 16,994,640 GSM7494294_AML10_REM_processed_matrix.mtx.gz
2023-06-16 01:29 75,569 GSM7494294_AML10_REM_processed_metadata.tsv.gz
2023-06-16 01:29 19,367,603 GSM7494294_AML10_REM_raw_barcodes.tsv.gz
2023-06-16 01:29 254,822 GSM7494294_AML10_REM_raw_features.tsv.gz
2023-06-16 01:29 31,032,102 GSM7494294_AML10_REM_raw_matrix.mtx.gz
2023-06-16 01:29 31,983 GSM7494295_AML11_DX_processed_barcodes.tsv.gz
2023-06-16 01:29 85,488 GSM7494295_AML11_DX_processed_features.tsv.gz
2023-06-16 01:29 50,814,144 GSM7494295_AML11_DX_processed_matrix.mtx.gz
2023-06-16 01:29 196,371 GSM7494295_AML11_DX_processed_metadata.tsv.gz
2023-06-16 01:30 19,367,602 GSM7494295_AML11_DX_raw_barcodes.tsv.gz
2023-06-16 01:30 254,821 GSM7494295_AML11_DX_raw_features.tsv.gz
2023-06-16 01:30 71,519,175 GSM7494295_AML11_DX_raw_matrix.mtx.gz
2023-06-16 01:30 36,074 GSM7494296_AML11_REL_processed_barcodes.tsv.gz
2023-06-16 01:30 85,489 GSM7494296_AML11_REL_processed_features.tsv.gz
2023-06-16 01:30 31,188,583 GSM7494296_AML11_REL_processed_matrix.mtx.gz
2023-06-16 01:30 217,837 GSM7494296_AML11_REL_processed_metadata.tsv.gz
2023-06-16 01:30 19,367,603 GSM7494296_AML11_REL_raw_barcodes.tsv.gz
2023-06-16 01:30 254,822 GSM7494296_AML11_REL_raw_features.tsv.gz
2023-06-16 01:30 46,608,949 GSM7494296_AML11_REL_raw_matrix.mtx.gz
2023-06-16 01:30 24,672 GSM7494297_AML11_REM_processed_barcodes.tsv.gz
2023-06-16 01:30 85,489 GSM7494297_AML11_REM_processed_features.tsv.gz
2023-06-16 01:31 32,305,699 GSM7494297_AML11_REM_processed_matrix.mtx.gz
2023-06-16 01:31 148,182 GSM7494297_AML11_REM_processed_metadata.tsv.gz
2023-06-16 01:31 19,367,603 GSM7494297_AML11_REM_raw_barcodes.tsv.gz
2023-06-16 01:31 254,822 GSM7494297_AML11_REM_raw_features.tsv.gz
2023-06-16 01:31 50,839,803 GSM7494297_AML11_REM_raw_matrix.mtx.gz
2023-06-16 01:31 21,439 GSM7494298_AML4_DX_processed_barcodes.tsv.gz
2023-06-16 01:31 83,468 GSM7494298_AML4_DX_processed_features.tsv.gz
2023-06-16 01:31 42,796,240 GSM7494298_AML4_DX_processed_matrix.mtx.gz
2023-06-16 01:31 129,642 GSM7494298_AML4_DX_processed_metadata.tsv.gz
2023-06-16 01:31 19,367,601 GSM7494298_AML4_DX_raw_barcodes.tsv.gz
2023-06-16 01:31 254,820 GSM7494298_AML4_DX_raw_features.tsv.gz
2023-06-16 01:32 65,577,474 GSM7494298_AML4_DX_raw_matrix.mtx.gz
2023-06-16 01:32 25,022 GSM7494299_AML4_REL_processed_barcodes.tsv.gz
2023-06-16 01:32 83,469 GSM7494299_AML4_REL_processed_features.tsv.gz
2023-06-16 01:32 50,700,292 GSM7494299_AML4_REL_processed_matrix.mtx.gz
2023-06-16 01:32 153,530 GSM7494299_AML4_REL_processed_metadata.tsv.gz
2023-06-16 01:32 19,367,602 GSM7494299_AML4_REL_raw_barcodes.tsv.gz
2023-06-16 01:32 254,821 GSM7494299_AML4_REL_raw_features.tsv.gz
2023-06-16 01:32 73,283,528 GSM7494299_AML4_REL_raw_matrix.mtx.gz
2023-06-16 01:32 34,124 GSM7494300_AML22_DX_processed_barcodes.tsv.gz
2023-06-16 01:32 80,339 GSM7494300_AML22_DX_processed_features.tsv.gz
2023-06-16 01:33 37,611,467 GSM7494300_AML22_DX_processed_matrix.mtx.gz
2023-06-16 01:33 210,383 GSM7494300_AML22_DX_processed_metadata.tsv.gz
2023-06-16 01:33 19,367,602 GSM7494300_AML22_DX_raw_barcodes.tsv.gz
2023-06-16 01:33 254,821 GSM7494300_AML22_DX_raw_features.tsv.gz
2023-06-16 01:33 60,746,924 GSM7494300_AML22_DX_raw_matrix.mtx.gz
2023-06-16 01:33 17,189 GSM7494301_AML22_REL_processed_barcodes.tsv.gz
2023-06-16 01:33 80,340 GSM7494301_AML22_REL_processed_features.tsv.gz
2023-06-16 01:33 16,384,657 GSM7494301_AML22_REL_processed_matrix.mtx.gz
2023-06-16 01:33 99,385 GSM7494301_AML22_REL_processed_metadata.tsv.gz
2023-06-16 01:33 19,367,603 GSM7494301_AML22_REL_raw_barcodes.tsv.gz
2023-06-16 01:33 254,822 GSM7494301_AML22_REL_raw_features.tsv.gz
2023-06-16 01:33 36,847,384 GSM7494301_AML22_REL_raw_matrix.mtx.gz
2023-06-16 01:33 14,335 GSM7494302_AML22_REM_processed_barcodes.tsv.gz
2023-06-16 01:33 80,340 GSM7494302_AML22_REM_processed_features.tsv.gz
2023-06-16 01:34 9,698,068 GSM7494302_AML22_REM_processed_matrix.mtx.gz
2023-06-16 01:34 81,471 GSM7494302_AML22_REM_processed_metadata.tsv.gz
2023-06-16 01:34 19,367,603 GSM7494302_AML22_REM_raw_barcodes.tsv.gz
2023-06-16 01:34 254,822 GSM7494302_AML22_REM_raw_features.tsv.gz
2023-06-16 01:34 23,500,627 GSM7494302_AML22_REM_raw_matrix.mtx.gz
2023-06-16 01:34 18,349 GSM7494303_AML21_DX_processed_barcodes.tsv.gz
2023-06-16 01:34 81,681 GSM7494303_AML21_DX_processed_features.tsv.gz
2023-06-16 01:34 21,737,585 GSM7494303_AML21_DX_processed_matrix.mtx.gz
2023-06-16 01:34 108,684 GSM7494303_AML21_DX_processed_metadata.tsv.gz
2023-06-16 01:34 19,367,602 GSM7494303_AML21_DX_raw_barcodes.tsv.gz
2023-06-16 01:34 254,821 GSM7494303_AML21_DX_raw_features.tsv.gz
2023-06-16 01:34 36,941,956 GSM7494303_AML21_DX_raw_matrix.mtx.gz
2023-06-16 01:34 23,035 GSM7494304_AML21_REL_processed_barcodes.tsv.gz
2023-06-16 01:34 81,682 GSM7494304_AML21_REL_processed_features.tsv.gz
2023-06-16 01:34 38,690,638 GSM7494304_AML21_REL_processed_matrix.mtx.gz
2023-06-16 01:35 139,497 GSM7494304_AML21_REL_processed_metadata.tsv.gz
2023-06-16 01:35 19,367,603 GSM7494304_AML21_REL_raw_barcodes.tsv.gz
2023-06-16 01:35 254,822 GSM7494304_AML21_REL_raw_features.tsv.gz
2023-06-16 01:35 65,996,782 GSM7494304_AML21_REL_raw_matrix.mtx.gz
2023-06-16 01:35 12,158 GSM7494305_AML21_REM_processed_barcodes.tsv.gz
2023-06-16 01:35 81,682 GSM7494305_AML21_REM_processed_features.tsv.gz
2023-06-16 01:35 10,315,955 GSM7494305_AML21_REM_processed_matrix.mtx.gz
2023-06-16 01:35 69,918 GSM7494305_AML21_REM_processed_metadata.tsv.gz
2023-06-16 01:35 19,367,603 GSM7494305_AML21_REM_raw_barcodes.tsv.gz
2023-06-16 01:35 254,822 GSM7494305_AML21_REM_raw_features.tsv.gz
2023-06-16 01:35 21,874,263 GSM7494305_AML21_REM_raw_matrix.mtx.gz
2023-06-16 01:35 30,641 GSM7494306_AML24_DX_processed_barcodes.tsv.gz
2023-06-16 01:35 79,707 GSM7494306_AML24_DX_processed_features.tsv.gz
2023-06-16 01:36 49,847,491 GSM7494306_AML24_DX_processed_matrix.mtx.gz
2023-06-16 01:36 189,649 GSM7494306_AML24_DX_processed_metadata.tsv.gz
2023-06-16 01:36 19,367,602 GSM7494306_AML24_DX_raw_barcodes.tsv.gz
2023-06-16 01:36 254,821 GSM7494306_AML24_DX_raw_features.tsv.gz
2023-06-16 01:36 73,581,135 GSM7494306_AML24_DX_raw_matrix.mtx.gz
2023-06-16 01:36 6,104 GSM7494307_AML24_REL_processed_barcodes.tsv.gz
2023-06-16 01:36 79,708 GSM7494307_AML24_REL_processed_features.tsv.gz
2023-06-16 01:36 4,367,879 GSM7494307_AML24_REL_processed_matrix.mtx.gz
2023-06-16 01:36 33,457 GSM7494307_AML24_REL_processed_metadata.tsv.gz
2023-06-16 01:36 19,367,603 GSM7494307_AML24_REL_raw_barcodes.tsv.gz
2023-06-16 01:36 254,822 GSM7494307_AML24_REL_raw_features.tsv.gz
2023-06-16 01:36 12,124,445 GSM7494307_AML24_REL_raw_matrix.mtx.gz
2023-06-16 01:36 11,568 GSM7494308_AML24_REM_processed_barcodes.tsv.gz
2023-06-16 01:36 79,708 GSM7494308_AML24_REM_processed_features.tsv.gz
2023-06-16 01:36 6,792,995 GSM7494308_AML24_REM_processed_matrix.mtx.gz
2023-06-16 01:36 64,376 GSM7494308_AML24_REM_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,603 GSM7494308_AML24_REM_raw_barcodes.tsv.gz
2023-06-16 01:37 254,822 GSM7494308_AML24_REM_raw_features.tsv.gz
2023-06-16 01:37 19,158,615 GSM7494308_AML24_REM_raw_matrix.mtx.gz
2023-06-16 01:37 7,225 GSM7494309_AML23_DX_processed_barcodes.tsv.gz
2023-06-16 01:37 80,901 GSM7494309_AML23_DX_processed_features.tsv.gz
2023-06-16 01:37 5,889,839 GSM7494309_AML23_DX_processed_matrix.mtx.gz
2023-06-16 01:37 39,458 GSM7494309_AML23_DX_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,602 GSM7494309_AML23_DX_raw_barcodes.tsv.gz
2023-06-16 01:37 254,821 GSM7494309_AML23_DX_raw_features.tsv.gz
2023-06-16 01:37 16,014,558 GSM7494309_AML23_DX_raw_matrix.mtx.gz
2023-06-16 01:37 34,830 GSM7494310_AML23_REL_processed_barcodes.tsv.gz
2023-06-16 01:37 80,902 GSM7494310_AML23_REL_processed_features.tsv.gz
2023-06-16 01:37 58,387,598 GSM7494310_AML23_REL_processed_matrix.mtx.gz
2023-06-16 01:37 217,147 GSM7494310_AML23_REL_processed_metadata.tsv.gz
2023-06-16 01:37 19,367,603 GSM7494310_AML23_REL_raw_barcodes.tsv.gz
2023-06-16 01:37 254,822 GSM7494310_AML23_REL_raw_features.tsv.gz
2023-06-16 01:38 83,333,215 GSM7494310_AML23_REL_raw_matrix.mtx.gz
2023-06-16 01:38 28,020 GSM7494311_AML23_REM_processed_barcodes.tsv.gz
2023-06-16 01:38 80,902 GSM7494311_AML23_REM_processed_features.tsv.gz
2023-06-16 01:38 30,724,433 GSM7494311_AML23_REM_processed_matrix.mtx.gz
2023-06-16 01:38 170,279 GSM7494311_AML23_REM_processed_metadata.tsv.gz
2023-06-16 01:38 19,367,603 GSM7494311_AML23_REM_raw_barcodes.tsv.gz
2023-06-16 01:38 254,822 GSM7494311_AML23_REM_raw_features.tsv.gz
2023-06-16 01:38 52,166,436 GSM7494311_AML23_REM_raw_matrix.mtx.gz
2023-06-16 01:38 29,063 GSM7494312_AML28_REL_processed_barcodes.tsv.gz
2023-06-16 01:38 79,976 GSM7494312_AML28_REL_processed_features.tsv.gz
2023-06-16 01:39 39,487,825 GSM7494312_AML28_REL_processed_matrix.mtx.gz
2023-06-16 01:39 178,547 GSM7494312_AML28_REL_processed_metadata.tsv.gz
2023-06-16 01:39 19,367,603 GSM7494312_AML28_REL_raw_barcodes.tsv.gz
2023-06-16 01:39 254,822 GSM7494312_AML28_REL_raw_features.tsv.gz
2023-06-16 01:39 55,665,343 GSM7494312_AML28_REL_raw_matrix.mtx.gz
2023-06-16 01:39 2,804 GSM7494313_AML28_REM_processed_barcodes.tsv.gz
2023-06-16 01:39 79,976 GSM7494313_AML28_REM_processed_features.tsv.gz
2023-06-16 01:39 3,198,093 GSM7494313_AML28_REM_processed_matrix.mtx.gz
2023-06-16 01:39 14,850 GSM7494313_AML28_REM_processed_metadata.tsv.gz
2023-06-16 01:39 19,367,603 GSM7494313_AML28_REM_raw_barcodes.tsv.gz
2023-06-16 01:39 254,822 GSM7494313_AML28_REM_raw_features.tsv.gz
2023-06-16 01:39 14,166,077 GSM7494313_AML28_REM_raw_matrix.mtx.gz
2023-06-16 01:39 31,471 GSM7494314_AML14_DX_processed_barcodes.tsv.gz
2023-06-16 01:39 83,070 GSM7494314_AML14_DX_processed_features.tsv.gz
2023-06-16 01:40 61,316,963 GSM7494314_AML14_DX_processed_matrix.mtx.gz
2023-06-16 01:40 196,533 GSM7494314_AML14_DX_processed_metadata.tsv.gz
2023-06-16 01:40 19,367,602 GSM7494314_AML14_DX_raw_barcodes.tsv.gz
2023-06-16 01:40 254,821 GSM7494314_AML14_DX_raw_features.tsv.gz
2023-06-16 01:40 89,692,805 GSM7494314_AML14_DX_raw_matrix.mtx.gz
2023-06-16 01:40 7,711 GSM7494315_AML14_REM_processed_barcodes.tsv.gz
2023-06-16 01:40 83,071 GSM7494315_AML14_REM_processed_features.tsv.gz
2023-06-16 01:40 10,598,432 GSM7494315_AML14_REM_processed_matrix.mtx.gz
2023-06-16 01:40 43,674 GSM7494315_AML14_REM_processed_metadata.tsv.gz
2023-06-16 01:40 19,367,603 GSM7494315_AML14_REM_raw_barcodes.tsv.gz
2023-06-16 01:40 254,822 GSM7494315_AML14_REM_raw_features.tsv.gz
2023-06-16 01:41 25,345,120 GSM7494315_AML14_REM_raw_matrix.mtx.gz
2023-06-16 01:41 27,580 GSM7494316_AML25_DX_processed_barcodes.tsv.gz
2023-06-16 01:41 84,863 GSM7494316_AML25_DX_processed_features.tsv.gz
2023-06-16 01:41 40,644,598 GSM7494316_AML25_DX_processed_matrix.mtx.gz
2023-06-16 01:41 168,141 GSM7494316_AML25_DX_processed_metadata.tsv.gz
2023-06-16 01:41 19,367,602 GSM7494316_AML25_DX_raw_barcodes.tsv.gz
2023-06-16 01:41 254,821 GSM7494316_AML25_DX_raw_features.tsv.gz
2023-06-16 01:41 63,812,052 GSM7494316_AML25_DX_raw_matrix.mtx.gz
2023-06-16 01:41 21,792 GSM7494317_AML25_REL_processed_barcodes.tsv.gz
2023-06-16 01:41 84,864 GSM7494317_AML25_REL_processed_features.tsv.gz
2023-06-16 01:41 34,618,558 GSM7494317_AML25_REL_processed_matrix.mtx.gz
2023-06-16 01:41 131,755 GSM7494317_AML25_REL_processed_metadata.tsv.gz
2023-06-16 01:42 19,367,603 GSM7494317_AML25_REL_raw_barcodes.tsv.gz
2023-06-16 01:42 254,822 GSM7494317_AML25_REL_raw_features.tsv.gz
2023-06-16 01:42 55,745,817 GSM7494317_AML25_REL_raw_matrix.mtx.gz
2023-06-16 01:42 18,308 GSM7494318_AML25_REM_processed_barcodes.tsv.gz
2023-06-16 01:42 84,864 GSM7494318_AML25_REM_processed_features.tsv.gz
2023-06-16 01:42 20,882,373 GSM7494318_AML25_REM_processed_matrix.mtx.gz
2023-06-16 01:42 108,886 GSM7494318_AML25_REM_processed_metadata.tsv.gz
2023-06-16 01:42 19,367,603 GSM7494318_AML25_REM_raw_barcodes.tsv.gz
2023-06-16 01:42 254,822 GSM7494318_AML25_REM_raw_features.tsv.gz
2023-06-16 01:42 36,642,968 GSM7494318_AML25_REM_raw_matrix.mtx.gz
2023-06-16 01:42 34,277 GSM7494319_AML26_DX_processed_barcodes.tsv.gz
2023-06-16 01:42 85,476 GSM7494319_AML26_DX_processed_features.tsv.gz
2023-06-16 01:43 50,657,064 GSM7494319_AML26_DX_processed_matrix.mtx.gz
2023-06-16 01:43 212,772 GSM7494319_AML26_DX_processed_metadata.tsv.gz
2023-06-16 01:43 19,367,602 GSM7494319_AML26_DX_raw_barcodes.tsv.gz
2023-06-16 01:43 254,821 GSM7494319_AML26_DX_raw_features.tsv.gz
2023-06-16 01:43 69,765,437 GSM7494319_AML26_DX_raw_matrix.mtx.gz
2023-06-16 01:43 31,673 GSM7494320_AML26_REL_processed_barcodes.tsv.gz
2023-06-16 01:43 85,477 GSM7494320_AML26_REL_processed_features.tsv.gz
2023-06-16 01:43 53,222,843 GSM7494320_AML26_REL_processed_matrix.mtx.gz
2023-06-16 01:43 195,228 GSM7494320_AML26_REL_processed_metadata.tsv.gz
2023-06-16 01:44 19,367,603 GSM7494320_AML26_REL_raw_barcodes.tsv.gz
2023-06-16 01:44 254,822 GSM7494320_AML26_REL_raw_features.tsv.gz
2023-06-16 01:44 70,110,195 GSM7494320_AML26_REL_raw_matrix.mtx.gz
2023-06-16 01:44 14,222 GSM7494321_AML26_REM_processed_barcodes.tsv.gz
2023-06-16 01:44 85,477 GSM7494321_AML26_REM_processed_features.tsv.gz
2023-06-16 01:44 19,145,731 GSM7494321_AML26_REM_processed_matrix.mtx.gz
2023-06-16 01:44 83,541 GSM7494321_AML26_REM_processed_metadata.tsv.gz
2023-06-16 01:44 19,367,603 GSM7494321_AML26_REM_raw_barcodes.tsv.gz
2023-06-16 01:44 254,822 GSM7494321_AML26_REM_raw_features.tsv.gz
2023-06-16 01:44 34,243,230 GSM7494321_AML26_REM_raw_matrix.mtx.gz
2023-06-16 01:44 23,266 GSM7494322_AML19_DX_processed_barcodes.tsv.gz
2023-06-16 01:44 79,634 GSM7494322_AML19_DX_processed_features.tsv.gz
2023-06-16 01:45 43,123,523 GSM7494322_AML19_DX_processed_matrix.mtx.gz
2023-06-16 01:45 141,973 GSM7494322_AML19_DX_processed_metadata.tsv.gz
2023-06-16 01:45 19,367,602 GSM7494322_AML19_DX_raw_barcodes.tsv.gz
2023-06-16 01:45 254,821 GSM7494322_AML19_DX_raw_features.tsv.gz
2023-06-16 01:45 58,552,443 GSM7494322_AML19_DX_raw_matrix.mtx.gz
2023-06-16 01:45 8,638 GSM7494323_AML19_REL_processed_barcodes.tsv.gz
2023-06-16 01:45 79,635 GSM7494323_AML19_REL_processed_features.tsv.gz
2023-06-16 01:45 18,591,088 GSM7494323_AML19_REL_processed_matrix.mtx.gz
2023-06-16 01:45 49,598 GSM7494323_AML19_REL_processed_metadata.tsv.gz
2023-06-16 01:45 19,367,603 GSM7494323_AML19_REL_raw_barcodes.tsv.gz
2023-06-16 01:45 254,822 GSM7494323_AML19_REL_raw_features.tsv.gz
2023-06-16 01:45 28,869,134 GSM7494323_AML19_REL_raw_matrix.mtx.gz
2023-06-16 01:45 32,079 GSM7494324_AML18_DX_processed_barcodes.tsv.gz
2023-06-16 01:45 80,632 GSM7494324_AML18_DX_processed_features.tsv.gz
2023-06-16 01:46 64,292,794 GSM7494324_AML18_DX_processed_matrix.mtx.gz
2023-06-16 01:46 200,001 GSM7494324_AML18_DX_processed_metadata.tsv.gz
2023-06-16 01:46 19,367,602 GSM7494324_AML18_DX_raw_barcodes.tsv.gz
2023-06-16 01:46 254,821 GSM7494324_AML18_DX_raw_features.tsv.gz
2023-06-16 01:46 89,327,411 GSM7494324_AML18_DX_raw_matrix.mtx.gz
2023-06-16 01:46 13,415 GSM7494325_AML18_REL_processed_barcodes.tsv.gz
2023-06-16 01:46 80,633 GSM7494325_AML18_REL_processed_features.tsv.gz
2023-06-16 01:46 15,992,541 GSM7494325_AML18_REL_processed_matrix.mtx.gz
2023-06-16 01:46 76,862 GSM7494325_AML18_REL_processed_metadata.tsv.gz
2023-06-16 01:47 19,367,603 GSM7494325_AML18_REL_raw_barcodes.tsv.gz
2023-06-16 01:47 254,822 GSM7494325_AML18_REL_raw_features.tsv.gz
2023-06-16 01:47 43,200,516 GSM7494325_AML18_REL_raw_matrix.mtx.gz
2023-06-16 01:47 33,194 GSM7494326_AML12_DX_processed_barcodes.tsv.gz
2023-06-16 01:47 84,192 GSM7494326_AML12_DX_processed_features.tsv.gz
2023-06-16 01:47 66,450,048 GSM7494326_AML12_DX_processed_matrix.mtx.gz
2023-06-16 01:47 207,398 GSM7494326_AML12_DX_processed_metadata.tsv.gz
2023-06-16 01:47 19,367,602 GSM7494326_AML12_DX_raw_barcodes.tsv.gz
2023-06-16 01:47 254,821 GSM7494326_AML12_DX_raw_features.tsv.gz
2023-06-16 01:48 98,958,970 GSM7494326_AML12_DX_raw_matrix.mtx.gz
2023-06-16 01:48 21,282 GSM7494327_AML12_REL_processed_barcodes.tsv.gz
2023-06-16 01:48 84,193 GSM7494327_AML12_REL_processed_features.tsv.gz
2023-06-16 01:48 38,509,535 GSM7494327_AML12_REL_processed_matrix.mtx.gz
2023-06-16 01:48 128,029 GSM7494327_AML12_REL_processed_metadata.tsv.gz
2023-06-16 01:48 19,367,603 GSM7494327_AML12_REL_raw_barcodes.tsv.gz
2023-06-16 01:48 254,822 GSM7494327_AML12_REL_raw_features.tsv.gz
2023-06-16 01:48 72,609,549 GSM7494327_AML12_REL_raw_matrix.mtx.gz
2023-06-16 01:48 15,382 GSM7494328_AML12_REM_processed_barcodes.tsv.gz
2023-06-16 01:48 84,193 GSM7494328_AML12_REM_processed_features.tsv.gz
2023-06-16 01:49 20,389,328 GSM7494328_AML12_REM_processed_matrix.mtx.gz
2023-06-16 01:49 90,294 GSM7494328_AML12_REM_processed_metadata.tsv.gz
2023-06-16 01:49 19,367,603 GSM7494328_AML12_REM_raw_barcodes.tsv.gz
2023-06-16 01:49 254,822 GSM7494328_AML12_REM_raw_features.tsv.gz
2023-06-16 01:49 40,579,509 GSM7494328_AML12_REM_raw_matrix.mtx.gz
2023-06-16 01:49 20,038 GSM7494329_AML13_DX_processed_barcodes.tsv.gz
2023-06-16 01:49 84,312 GSM7494329_AML13_DX_processed_features.tsv.gz
2023-06-16 01:49 23,394,461 GSM7494329_AML13_DX_processed_matrix.mtx.gz
2023-06-16 01:49 119,791 GSM7494329_AML13_DX_processed_metadata.tsv.gz
2023-06-16 01:49 19,367,602 GSM7494329_AML13_DX_raw_barcodes.tsv.gz
2023-06-16 01:49 254,821 GSM7494329_AML13_DX_raw_features.tsv.gz
2023-06-16 01:50 51,422,961 GSM7494329_AML13_DX_raw_matrix.mtx.gz
2023-06-16 01:50 29,948 GSM7494330_AML13_REL_processed_barcodes.tsv.gz
2023-06-16 01:50 84,313 GSM7494330_AML13_REL_processed_features.tsv.gz
2023-06-16 01:50 46,599,252 GSM7494330_AML13_REL_processed_matrix.mtx.gz
2023-06-16 01:50 184,833 GSM7494330_AML13_REL_processed_metadata.tsv.gz
2023-06-16 01:50 19,367,603 GSM7494330_AML13_REL_raw_barcodes.tsv.gz
2023-06-16 01:50 254,822 GSM7494330_AML13_REL_raw_features.tsv.gz
2023-06-16 01:50 70,788,361 GSM7494330_AML13_REL_raw_matrix.mtx.gz
2023-06-16 01:50 28,887 GSM7494331_AML13_REM_processed_barcodes.tsv.gz
2023-06-16 01:50 84,313 GSM7494331_AML13_REM_processed_features.tsv.gz
2023-06-16 01:50 24,651,210 GSM7494331_AML13_REM_processed_matrix.mtx.gz
2023-06-16 01:50 177,066 GSM7494331_AML13_REM_processed_metadata.tsv.gz
2023-06-16 01:51 19,367,603 GSM7494331_AML13_REM_raw_barcodes.tsv.gz
2023-06-16 01:51 254,822 GSM7494331_AML13_REM_raw_features.tsv.gz
2023-06-16 01:51 64,901,299 GSM7494331_AML13_REM_raw_matrix.mtx.gz
526 个文件 15,087,903,462 字节
2 个目录 211,161,419,776 可用字节
# 看一下其中一个features文件
!zcat GSM7494269_AML3_DX_raw_features.tsv.gz | head
ENSG00000243485 MIR1302-2HG ENSG00000237613 FAM138A ENSG00000186092 OR4F5 ENSG00000238009 AL627309.1 ENSG00000239945 AL627309.3 ENSG00000239906 AL627309.2 ENSG00000241599 AL627309.4 ENSG00000236601 AL732372.1 ENSG00000284733 OR4F29 ENSG00000235146 AC114498.1
# The file has only 2 columns; a Cell Ranger (v3+) features.tsv normally has 3: Ensembl gene ID, gene symbol, and feature type (e.g. "Gene Expression")
# 改工作目录
os.chdir('../')
print("当前工作目录:", os.getcwd())
当前工作目录: C:\Users\Administrator\Desktop\test
files = os.listdir('./data') #获取目录中的所有文件和子目录,并将它们的名称存储在列表 files 中
import gzip
!mkdir temp
# Add the third "feature type" column to every features.tsv.gz so the files
# match the three-column layout; the rewritten copies go to ./temp.
ffs = [x for x in files if 'feature' in x]
for ff in ffs:
    # Join with a real path separator: './data' + ff would yield './dataGSM...'.
    src_path = os.path.join('./data', ff)
    dst_path = os.path.join('./temp', ff)
    with gzip.open(src_path, 'rt') as f_in, gzip.open(dst_path, 'wt') as f_out:
        for line in f_in:
            row = line.strip()
            # Leave rows that already carry a third column untouched so that
            # re-running this cell does not append the label twice.
            if row.count('\t') >= 2:
                f_out.write(row + "\n")
            else:
                f_out.write(row + "\tGene Expression\n")
!zcat ./temp/GSM7494269_AML3_DX_raw_features.tsv.gz | head
ENSG00000243485 MIR1302-2HG Gene Expression ENSG00000237613 FAM138A Gene Expression ENSG00000186092 OR4F5 Gene Expression ENSG00000238009 AL627309.1 Gene Expression ENSG00000239945 AL627309.3 Gene Expression ENSG00000239906 AL627309.2 Gene Expression ENSG00000241599 AL627309.4 Gene Expression ENSG00000236601 AL732372.1 Gene Expression ENSG00000284733 OR4F29 Gene Expression ENSG00000235146 AC114498.1 Gene Expression
!mv ./temp/* ./data #将 temp 目录中的所有文件移动到 data 目录中
!zcat ./data/GSM7494269_AML3_DX_raw_features.tsv.gz | head
ENSG00000243485 MIR1302-2HG Gene Expression ENSG00000237613 FAM138A Gene Expression ENSG00000186092 OR4F5 Gene Expression ENSG00000238009 AL627309.1 Gene Expression ENSG00000239945 AL627309.3 Gene Expression ENSG00000239906 AL627309.2 Gene Expression ENSG00000241599 AL627309.4 Gene Expression ENSG00000236601 AL732372.1 Gene Expression ENSG00000284733 OR4F29 Gene Expression ENSG00000235146 AC114498.1 Gene Expression
Remove Ambient RNA¶
Ambient RNA refers to extraneous RNA molecules that are not originally from the sample of interest but are present in the experimental environment
We use CellBender in terminal,https://github.com/broadinstitute/CellBender
conda create -n cb python=3.7
conda activate cb
pip install cellbender
CellBender needs h5 or h5ad files as input.
# sc.read_10x_mtx 函数。这个函数通常用于读取以 matrix.mtx, barcodes.tsv, 和 features.tsv 命名的文件
# prefix 参数用于指定文件名前缀
sc.read_10x_mtx('./data', prefix = 'GSM7494269_AML3_DX_raw_')
AnnData object with n_obs × n_vars = 6794880 × 33538
var: 'gene_ids', 'feature_types'
# batch reading the files and saving them as h5ad for CellBender
!mkdir raw_adata
# 筛选没有tar与processed的文件名
# 并以'_raw'为分割,挑选前面加上'_raw_'
# 使用set去重,确保每个前缀只出现一次
set([x.split('_raw')[0] + '_raw_' for x in files if 'processed' not in x and 'tar' not in x])
{'GSM7494257_AML16_DX_raw_',
'GSM7494258_AML16_REL_raw_',
'GSM7494259_AML16_REM_raw_',
'GSM7494260_AML6_DX_raw_',
'GSM7494261_AML6_REL_raw_',
'GSM7494262_AML6_REM_raw_',
'GSM7494263_AML2_DX_raw_',
'GSM7494264_AML2_REL_raw_',
'GSM7494265_AML2_REM_raw_',
'GSM7494266_AML15_DX_raw_',
'GSM7494267_AML15_REL_raw_',
'GSM7494268_AML15_REM_raw_',
'GSM7494269_AML3_DX_raw_',
'GSM7494270_AML3_REM_raw_',
'GSM7494271_AML7_DX_raw_',
'GSM7494272_AML7_REL_raw_',
'GSM7494273_AML7_REM_raw_',
'GSM7494274_AML8_DX_raw_',
'GSM7494275_AML8_REL_raw_',
'GSM7494276_AML8_REM_raw_',
'GSM7494277_AML20_DX_raw_',
'GSM7494278_AML20_REM_raw_',
'GSM7494279_AML5_DX_raw_',
'GSM7494280_AML5_REL_raw_',
'GSM7494281_AML5_REM_raw_',
'GSM7494282_AML17_DX_raw_',
'GSM7494283_AML17_REL_raw_',
'GSM7494284_AML1_REM_raw_',
'GSM7494285_AML1_DX_raw_',
'GSM7494286_AML27_DX_raw_',
'GSM7494287_AML27_REL_raw_',
'GSM7494288_AML27_REM_raw_',
'GSM7494289_AML9_DX_raw_',
'GSM7494290_AML9_REL_raw_',
'GSM7494291_AML9_REM_raw_',
'GSM7494292_AML10_DX_raw_',
'GSM7494293_AML10_REL_raw_',
'GSM7494294_AML10_REM_raw_',
'GSM7494295_AML11_DX_raw_',
'GSM7494296_AML11_REL_raw_',
'GSM7494297_AML11_REM_raw_',
'GSM7494298_AML4_DX_raw_',
'GSM7494299_AML4_REL_raw_',
'GSM7494300_AML22_DX_raw_',
'GSM7494301_AML22_REL_raw_',
'GSM7494302_AML22_REM_raw_',
'GSM7494303_AML21_DX_raw_',
'GSM7494304_AML21_REL_raw_',
'GSM7494305_AML21_REM_raw_',
'GSM7494306_AML24_DX_raw_',
'GSM7494307_AML24_REL_raw_',
'GSM7494308_AML24_REM_raw_',
'GSM7494309_AML23_DX_raw_',
'GSM7494310_AML23_REL_raw_',
'GSM7494311_AML23_REM_raw_',
'GSM7494312_AML28_REL_raw_',
'GSM7494313_AML28_REM_raw_',
'GSM7494314_AML14_DX_raw_',
'GSM7494315_AML14_REM_raw_',
'GSM7494316_AML25_DX_raw_',
'GSM7494317_AML25_REL_raw_',
'GSM7494318_AML25_REM_raw_',
'GSM7494319_AML26_DX_raw_',
'GSM7494320_AML26_REL_raw_',
'GSM7494321_AML26_REM_raw_',
'GSM7494322_AML19_DX_raw_',
'GSM7494323_AML19_REL_raw_',
'GSM7494324_AML18_DX_raw_',
'GSM7494325_AML18_REL_raw_',
'GSM7494326_AML12_DX_raw_',
'GSM7494327_AML12_REL_raw_',
'GSM7494328_AML12_REM_raw_',
'GSM7494329_AML13_DX_raw_',
'GSM7494330_AML13_REL_raw_',
'GSM7494331_AML13_REM_raw_'}
# Refresh the listing of ./data, then derive one '<sample>_raw_' prefix per
# sample from the raw (non-processed, non-tar) file names.
files = os.listdir('./data')
raw_prefixes = {
    name.split('_raw')[0] + '_raw_'
    for name in files
    if 'processed' not in name and 'tar' not in name
}
# Read each sample's matrix/barcodes/features triplet and store it as h5ad.
for prefix in raw_prefixes:
    adata = sc.read_10x_mtx('./data', prefix=prefix)
    adata.write_h5ad('raw_adata/' + prefix + '.h5ad')
!mkdir clean_adata
In terminal traverse h5ad files and remove AmbientRNA with cellbender:
# The .h5ad inputs were written to raw_adata/, so the loop has to run there;
# results are written next door into clean_adata/.
cd raw_adata
for file in *.h5ad; do
    # $(basename "$file" .h5ad) strips the .h5ad suffix; "denoised.h5" is then
    # appended to form the output file name.
    # --total-droplets-included 50000: number of droplets included in the analysis
    # --cuda: run the computation on the GPU
    cellbender remove-background \
        --input "$file" \
        --output "../clean_adata/$(basename "$file" .h5ad)denoised.h5" \
        --total-droplets-included 50000 \
        --cuda
done
会生成大量文件,其中一个为QC plot ,其中一幅图会根据umi去除environment RNA
What is the UMI?
UMI是一段短的、随机的核苷酸序列,在扩增之前添加到每个RNA或DNA分子上。UMI对于每个单独的分子都是独特的,这样研究人员可以区分原始分子和在扩增过程中产生的PCR重复。
What is the “Empty Droplet Plateau”?
“Empty droplet plateau” 是单细胞 RNA 测序数据分析中一个常见的现象。它指的是在 UMI (Unique Molecular Identifier) 计数分布中,一部分液滴的 UMI 计数非常低,这些液滴大多数是没有捕获到细胞的空液滴(empty droplets),而这些空液滴中的 UMI 主要来自于环境 RNA。
在单细胞 RNA 测序中,样品被分离成许多微小的液滴,每个液滴理论上包含一个细胞。然而,很多液滴实际上是空的,不包含任何细胞。这些空液滴中的 RNA 通常是从细胞破裂或其他来源进入液滴的环境 RNA。
当你绘制所有液滴的 UMI 计数分布图时,通常会看到一个平坦的区域(plateau),这就是所谓的“empty droplet plateau”。在这个区域内,大部分液滴的 UMI 计数非常低,主要由环境 RNA 组成。
import pandas as pd
#cellbender 处理后会出来一个metrics文件,其包含其过滤前后信息
#这段代码的功能是读取并处理多个 CSV 文件,将它们的数据框格式化并存储在一个列表中,每个数据框还包含了对应的文件名
# Collect every CellBender '*metrics.csv' file in clean_adata/ into one table:
# each file becomes a single row (metric names as columns) tagged with the
# name of the file it came from.
metric_files = [name for name in os.listdir('clean_adata/') if name.endswith('metrics.csv')]
metrics = []
for file in metric_files:
    table = pd.read_csv('clean_adata/' + file, header=None, names=['Metric', 'Value'])
    row = table.set_index('Metric').T  # metric names become the columns
    row['File'] = file
    metrics.append(row)
metrics = pd.concat(metrics).reset_index()
metrics
| Metric | index | total_raw_counts | total_output_counts | total_counts_removed | fraction_counts_removed | total_raw_counts_in_cells | total_counts_removed_from_cells | fraction_counts_removed_from_cells | average_counts_removed_per_cell | target_fpr | expected_cells | found_cells | output_average_counts_per_cell | ratio_of_found_cells_to_expected_cells | found_empties | fraction_of_analyzed_droplets_that_are_nonempty | convergence_indicator | overall_change_in_train_elbo | File |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Value | 32401298.0 | 31916568.0 | 484730.0 | 0.015 | 32401292.0 | 484724.0 | 0.015 | 60.613 | 0.01 | 8000.0 | 7997.0 | 3991.068 | 1.000 | 42003.0 | 0.160 | 0.210 | 818.798 | GSM7494294_AML10_REM_raw_denoised_metrics.csv |
| 1 | Value | 66959644.0 | 66108581.0 | 851063.0 | 0.013 | 66959660.0 | 851079.0 | 0.013 | 95.885 | 0.01 | 8000.0 | 8876.0 | 7448.015 | 1.109 | 41124.0 | 0.178 | 1.416 | 957.200 | GSM7494320_AML26_REL_raw_denoised_metrics.csv |
| 2 | Value | 98889072.0 | 97229381.0 | 1659691.0 | 0.017 | 98889080.0 | 1659699.0 | 0.017 | 161.985 | 0.01 | 8000.0 | 10246.0 | 9489.496 | 1.281 | 39754.0 | 0.205 | 0.773 | 2537.736 | GSM7494326_AML12_DX_raw_denoised_metrics.csv |
| 3 | Value | 6404987.0 | 6171930.0 | 233057.0 | 0.036 | 6404987.0 | 233057.0 | 0.036 | 24.717 | 0.01 | 8000.0 | 9429.0 | 654.569 | 1.179 | 40571.0 | 0.189 | 0.607 | 122.314 | GSM7494313_AML28_REM_raw_denoised_metrics.csv |
| 4 | Value | 40590968.0 | 39961889.0 | 629079.0 | 0.015 | 40590940.0 | 629051.0 | 0.015 | 58.773 | 0.01 | 8000.0 | 10703.0 | 3733.709 | 1.338 | 39297.0 | 0.214 | 0.966 | 1538.943 | GSM7494296_AML11_REL_raw_denoised_metrics.csv |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 70 | Value | 42653264.0 | 39980282.0 | 2672982.0 | 0.063 | 42653280.0 | 2672998.0 | 0.063 | 73.317 | 0.01 | 8000.0 | 36458.0 | 1096.612 | 4.557 | 13542.0 | 0.729 | 2.415 | 684.872 | GSM7494331_AML13_REM_raw_denoised_metrics.csv |
| 71 | Value | 40126384.0 | 39342411.0 | 783973.0 | 0.020 | 40126392.0 | 783981.0 | 0.020 | 67.104 | 0.01 | 8000.0 | 11683.0 | 3367.492 | 1.460 | 38317.0 | 0.234 | 0.958 | 743.470 | GSM7494283_AML17_REL_raw_denoised_metrics.csv |
| 72 | Value | 41402972.0 | 40733420.0 | 669552.0 | 0.016 | 41402952.0 | 669532.0 | 0.016 | 72.554 | 0.01 | 8000.0 | 9228.0 | 4414.111 | 1.153 | 40772.0 | 0.185 | 1.650 | 1387.417 | GSM7494288_AML27_REM_raw_denoised_metrics.csv |
| 73 | Value | 57972396.0 | 56787970.0 | 1184426.0 | 0.020 | 57972428.0 | 1184458.0 | 0.020 | 117.076 | 0.01 | 8000.0 | 10117.0 | 5613.123 | 1.265 | 39883.0 | 0.202 | 0.843 | 1011.199 | GSM7494264_AML2_REL_raw_denoised_metrics.csv |
| 74 | Value | 48269624.0 | 47613683.0 | 655941.0 | 0.014 | 48269636.0 | 655953.0 | 0.014 | 77.035 | 0.01 | 8000.0 | 8515.0 | 5591.742 | 1.064 | 41485.0 | 0.170 | 0.658 | 1288.932 | GSM7494312_AML28_REL_raw_denoised_metrics.csv |
75 rows × 19 columns
metrics.hist('fraction_counts_removed') #根据fraction_counts_removed这一列绘制直方图
# Most samples had about 2% of their counts removed and only a few about 9%; none exceeds 10%, which is in line with expectations
array([[<Axes: title={'center': 'fraction_counts_removed'}>]],
dtype=object)
QC¶
# Collect the names of the filtered CellBender outputs (files ending in 'filtered.h5')
adatas = [x for x in os.listdir('clean_adata/') if x.endswith('filtered.h5')]
def load_it(adata):
    """Load one file from clean_adata/ and tag its cells with sample info.

    `adata` is the file name. Its 2nd and 3rd '_'-separated fields are stored
    in the obs columns 'Patient' and 'DX', and joined as 'Patient_DX' in
    'Sample'. The same '-<patient>_<dx>' suffix is appended to every obs index
    entry. Returns the AnnData object read by `sc.read_10x_h5`.
    """
    name_fields = adata.split('_')
    patient, status = name_fields[1], name_fields[2]

    loaded = sc.read_10x_h5('clean_adata/' + adata)
    loaded.obs['Patient'] = patient
    loaded.obs['DX'] = status
    loaded.obs['Sample'] = loaded.obs['Patient'] + '_' + loaded.obs['DX']
    loaded.obs.index = loaded.obs.index + '-' + patient + '_' + status
    return loaded
adatas = [load_it(ad) for ad in adatas]
adatas
def qc(adata):
    """Drop near-empty cells and attach per-cell QC metrics to `adata`.

    Cells with fewer than 200 detected genes are removed in place. Genes are
    then flagged as mitochondrial ('mt'), ribosomal ('ribo') or hemoglobin
    ('hb') by name prefix, and `sc.pp.calculate_qc_metrics` is run on those
    flags. The prefixes are upper-case, so adjust the case for other species.
    The raw and log1p total-count columns of the three gene sets are dropped
    from `.obs` afterwards. Returns the same `adata` object.
    """
    sc.pp.filter_cells(adata, min_genes=200)

    names = adata.var_names
    gene_flags = {
        "mt": names.str.startswith("MT-"),
        "ribo": names.str.startswith(("RPS", "RPL")),
        "hb": names.str.contains("^HB[^(P)]"),
    }
    for flag, mask in gene_flags.items():
        adata.var[flag] = mask

    sc.pp.calculate_qc_metrics(
        adata, qc_vars=list(gene_flags), inplace=True, percent_top=[20], log1p=True
    )

    # Drop the per-gene-set total-count columns to keep .obs compact.
    dropped = {
        'total_counts_mt', 'log1p_total_counts_mt',
        'total_counts_ribo', 'log1p_total_counts_ribo',
        'total_counts_hb', 'log1p_total_counts_hb',
    }
    kept = [col for col in adata.obs.columns if col not in dropped]
    adata.obs = adata.obs[kept]
    return adata
adatas = [qc(ad) for ad in adatas]
# Stack the per-cell QC tables (.obs) of all samples into one DataFrame,
# ordered by sample name.
df = pd.concat([x.obs for x in adatas])
df = df.sort_values('Sample')

# QC metric to plot. Other useful choices: "pct_counts_mt", "n_genes",
# 'pct_counts_in_top_20_genes' (in good data the 20 most abundant genes
# should not account for too large a share of the counts).
value = "log1p_total_counts"

# White theme with transparent axes backgrounds so the stacked rows can overlap.
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# One row per sample, coloured by sample; aspect=15 makes each row 15x wider
# than it is tall.
g = sns.FacetGrid(df, row="Sample", hue="Sample", aspect=15, height=0.5, palette="tab20")

# Filled density curve, then a white outline on top of it, then a baseline at
# y=0. clip_on=False lets the artists extend past the axes limits.
g.map(sns.kdeplot, value, clip_on=False, fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, value, clip_on=False, color="w", lw=2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)


def label(x, color, label):
    """Write the facet's label in bold, in the facet colour, at axes coords (0, 0.2)."""
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)


g.map(label, value)

# Negative spacing makes the rows overlap; then strip titles, y ticks/labels
# and the bottom/left spines.
g.figure.subplots_adjust(hspace=-.6)
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# Mark the median over all cells in every row. It is the same value for each
# axis, so compute it once outside the loop.
overall_median = df[value].median()
for ax in g.axes.flat:
    ax.axvline(x=overall_median, color='r', linestyle='-')
plt.show()
Preprocessing (removing outliers and doublets)¶
Alternative doublet detection method: https://github.com/mousepixels/sanbomics_scripts/blob/main/doublet_removal_SOLO_scVI.ipynb
import doubletdetection
# median_abs_deviation(中位数绝对偏差)是一个统计量,用于衡量数据集的离散程度。相比标准差,它对异常值(离群值)更为稳健
# median_abs_deviation 可以用于检测和移除离群值
from scipy.stats import median_abs_deviation as mad
import numpy as np
Processing a sample¶
a = df[df.Sample == 'AML10_REM'].log1p_total_counts
# 取中位数
np.median(a)
8.897545514588673
取阈值:阈值为中位数加减 5 倍的中位数绝对偏差(MAD)
$$\mathrm{MAD} = \operatorname{median}\left(\lvert a_i - \operatorname{median}(a) \rvert\right)$$
np.median(a) - 5 * mad(a)
6.3832497277383995
np.median(a) + 5 * mad(a)
11.411841301438947
# 绘制数据分布图
ax = sns.displot(a)
plt.axvline(np.median(a) - 5 * mad(a))
plt.axvline(np.median(a) + 5 * mad(a))
plt.show()
you can change threshold according to each sample
Processing all samples¶
def mad_outlier(adata, metric, nmads, upper_only = False):
    """Flag cells whose `metric` lies too far from the sample median.

    Parameters
    ----------
    adata : object with an `.obs` table (e.g. AnnData)
    metric : name of the `.obs` column to test
    nmads : how many median absolute deviations (MAD) from the median a
        value may lie before it counts as an outlier
    upper_only : if True, only values above ``median + nmads * MAD`` are
        flagged; otherwise values below ``median - nmads * MAD`` are
        flagged as well

    Returns
    -------
    Boolean mask aligned with ``adata.obs[metric]``; True marks an outlier.
    """
    values = adata.obs[metric]
    # Median and MAD are the same for both bounds, so compute them once.
    center = np.median(values)
    spread = nmads * mad(values)

    too_high = values > center + spread
    if upper_only:
        return too_high
    return (values < center - spread) | too_high
# Build the doubletdetection BoostClassifier used to flag doublets.
clf = doubletdetection.BoostClassifier(
    n_iters=10,  # number of fit iterations
    clustering_algorithm="louvain",  # cluster with the Louvain algorithm
    standard_scaling=True,  # standard-scale the data before fitting (zero mean, unit variance)
    pseudocount=0.1,  # pseudocount used during normalization, avoiding log of zero
    n_jobs=-1)  # -1: use all available CPU cores
def pp(adata):
    """Remove low-quality cells and predicted doublets from one sample.

    Steps:
      1. Drop cells with 25% or more mitochondrial counts (lower this cutoff
         if the overall distribution of the dataset allows it).
      2. Drop MAD outliers: 5 MADs on either side for total counts, detected
         genes and the top-20-gene share; 3 MADs on the high side only for
         the mitochondrial fraction (a low fraction is not a problem).
      3. Run the module-level `clf` doublet classifier and drop predicted
         doublets.

    The number of cells removed in step 2 is stored in
    ``uns['cells_removed']`` and the number of doublets found in step 3 in
    ``uns['doublets_removed']``. The per-cell calls and scores are kept in
    ``obs['doublet']`` and ``obs['doublet_score']``.

    Returns a new, independent AnnData object holding the retained cells.
    """
    adata = adata[adata.obs.pct_counts_mt < 25]

    # A cell is an outlier if ANY metric flags it, so combine the masks with a
    # logical OR rather than adding booleans.
    is_outlier = (
        mad_outlier(adata, 'log1p_total_counts', 5)
        | mad_outlier(adata, 'log1p_n_genes_by_counts', 5)
        | mad_outlier(adata, 'pct_counts_in_top_20_genes', 5)
        | mad_outlier(adata, 'pct_counts_mt', 3, upper_only = True)
    )

    # Materialise the subset before writing to .uns/.obs, so the writes do not
    # land on a view and trigger an implicit copy.
    adata = adata[~is_outlier].copy()
    adata.uns['cells_removed'] = int(is_outlier.sum())

    # Doublet detection: fit on the count matrix, then call doublets.
    # p_thresh is the p-value cutoff for a per-iteration doublet call and
    # voter_thresh the fraction of iterations that must agree; adjust them to
    # change how many cells are removed.
    doublets = clf.fit(adata.X).predict(p_thresh=1e-3, voter_thresh=0.5)
    doublet_score = clf.doublet_score()

    # 1 marks a doublet, 0 a singlet.
    adata.obs["doublet"] = doublets
    adata.obs["doublet_score"] = doublet_score
    adata.uns['doublets_removed'] = adata.obs.doublet.sum()

    # Keep only cells called as singlets.
    return adata[adata.obs.doublet == 0].copy()
adatas = [pp(ad) for ad in adatas]
Note that these data were a little noisy and a higher proportion of cells were removed than what would be expected from a "clean" dataset
If the number of cells removed in MAD filtering is very large relative to overall cells, it may be worthwhile to increase thresholds or use a less conservative method. ~5-15% is typical.
Expected doublet rate is typically ~2-10% depending on the number of cells sequenced for 10x workflows
If anything deviates drastically, visit that sample specifically.
# Per sample: cells kept, cells removed by the MAD filter, doublets removed
# (a doublet rate of roughly 2-10% is the expected range).
for adata in adatas:
    removed = adata.uns['cells_removed']
    doublets_found = adata.uns['doublets_removed']
    print(len(adata), removed, doublets_found)
1693 1186 91.0 2760 1837 283.0 3598 1116 369.0 3500 577 306.0 7886 3458 4297.0 2242 619 129.0 2874 1018 355.0 2888 1172 427.0 3630 1207 337.0 6768 984 721.0 6025 981 830.0 4178 437 564.0 5688 793 511.0 3332 999 441.0 4086 1122 475.0 1596 655 36.0 5016 781 993.0 5247 715 529.0 2314 892 84.0 3388 533 495.0 5065 684 423.0 3287 1310 481.0 1728 317 46.0 1643 504 48.0 1326 448 59.0 5376 856 604.0 6412 767 997.0 3553 1184 548.0 4207 984 308.0 5366 1051 486.0 5292 1216 1046.0 4182 434 524.0 5347 1461 490.0 7798 729 998.0 4079 922 321.0 6639 1011 1068.0 2352 418 166.0 4296 884 243.0 3088 684 458.0 5858 949 633.0 4889 525 369.0 4662 812 466.0 3845 566 379.0 3576 1688 336.0 4459 620 366.0 6080 1292 956.0 3307 798 341.0 5438 1495 1086.0 3486 807 508.0 5529 1432 554.0 1240 299 52.0 3983 769 466.0 4497 799 282.0 5776 1131 561.0 1501 212 65.0 5533 961 563.0 6345 1258 567.0 5766 860 590.0 2057 937 111.0 6567 1140 935.0 5931 1063 519.0 2803 861 204.0 2512 956 333.0 2531 590 128.0 4506 755 552.0 2269 1039 89.0 1817 456 114.0 4417 818 386.0 4231 711 318.0 1005 119 17.0 79 11 0.0 3784 758 554.0 2406 591 170.0 4854 759 572.0 3065 532 202.0
对于不同细胞数量的测序,去除的细胞数不一样,通常细胞数越多,去除的细胞占比越多
# Distributions after preprocessing (df still holds the pre-filter table).
df2 = pd.concat([x.obs for x in adatas])
df2 = df2.sort_values('Sample')

# QC metric to plot. Other choices: "pct_counts_mt", "n_genes",
# "log1p_total_counts".
value = 'pct_counts_in_top_20_genes'

sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
g = sns.FacetGrid(df2, row="Sample", hue="Sample", aspect=15, height=0.5, palette="tab20")

# Filled density, white outline on top, and a baseline at y=0 for every row.
g.map(sns.kdeplot, value, clip_on=False, fill=True, alpha=1, linewidth=1.5)
g.map(sns.kdeplot, value, clip_on=False, color="w", lw=2)
g.map(plt.axhline, y=0, lw=2, clip_on=False)


def label(x, color, label):
    """Write the facet's label in bold, in the facet colour, at axes coords (0, 0.2)."""
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)


g.map(label, value)

# Overlap the rows and strip titles, y ticks/labels and bottom/left spines.
g.figure.subplots_adjust(hspace=-.6)
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

# The median over all cells is the same for every row: compute it once.
overall_median = df2[value].median()
for ax in g.axes.flat:
    ax.axvline(x=overall_median, color='r', linestyle='-')
plt.show()
Loading modules¶
import scanpy as sc
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scvi
C:\ProgramData\anaconda3\envs\sc2024\lib\site-packages\scvi\__init__.py:31: DeprecationWarning: scvi is deprecated, please uninstall scvi via `pip uninstall scvi` and install the new scvi-tools package at github.com/YosefLab/scvi-tools warnings.warn(deprecation_msg, DeprecationWarning)
# 安装必要的库
# celltypist:一个用于自动细胞类型注释的工具,基于已知的细胞类型基因表达数据。
# scvi-tools:一个用于单细胞变分推理的工具包,基于深度学习,用于单细胞基因表达数据的分析。
# hyperopt:一个用于超参数优化的 Python 库,可以通过贝叶斯优化来寻找最佳超参数组合。
# ray[tune]:ray 是一个用于分布式计算的框架,tune 是 ray 中的一个模块,用于超参数搜索和优化。
# anndata2ri:一个用于在 R 和 Python 之间转换 AnnData 对象的库,方便在两种语言之间共享单细胞 RNA 测序数据。
#%pip install celltypist scvi-tools hyperopt "ray[tune]" anndata2ri
import warnings
warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter("ignore", RuntimeWarning)
Creating celltypist models¶
import celltypist #celltypist 是一个用于细胞类型注释的工具
from celltypist import models #专门用于管理和操作细胞类型注释的预训练模型。它提供了下载、加载和使用预训练模型的功能。
Models provided by CellTypist¶
# List the models that ship with CellTypist
models.get_all_models()
['Adult_COVID19_PBMC.pkl', 'Adult_CynomolgusMacaque_Hippocampus.pkl', 'Adult_Human_PancreaticIslet.pkl', 'Adult_Human_Skin.pkl', 'Adult_Mouse_Gut.pkl', 'Adult_Mouse_OlfactoryBulb.pkl', 'Adult_Pig_Hippocampus.pkl', 'Adult_RhesusMacaque_Hippocampus.pkl', 'Autopsy_COVID19_Lung.pkl', 'Cells_Adult_Breast.pkl', 'Cells_Fetal_Lung.pkl', 'Cells_Human_Tonsil.pkl', 'Cells_Intestinal_Tract.pkl', 'Cells_Lung_Airway.pkl', 'COVID19_HumanChallenge_Blood.pkl', 'COVID19_Immune_Landscape.pkl', 'Developing_Human_Brain.pkl', 'Developing_Human_Gonads.pkl', 'Developing_Human_Hippocampus.pkl', 'Developing_Human_Organs.pkl', 'Developing_Human_Thymus.pkl', 'Developing_Mouse_Brain.pkl', 'Developing_Mouse_Hippocampus.pkl', 'Fetal_Human_AdrenalGlands.pkl', 'Fetal_Human_Pancreas.pkl', 'Fetal_Human_Pituitary.pkl', 'Fetal_Human_Retina.pkl', 'Fetal_Human_Skin.pkl', 'Healthy_Adult_Heart.pkl', 'Healthy_COVID19_PBMC.pkl', 'Healthy_Human_Liver.pkl', 'Healthy_Mouse_Liver.pkl', 'Human_AdultAged_Hippocampus.pkl', 'Human_Colorectal_Cancer.pkl', 'Human_Developmental_Retina.pkl', 'Human_Embryonic_YolkSac.pkl', 'Human_IPF_Lung.pkl', 'Human_Longitudinal_Hippocampus.pkl', 'Human_Lung_Atlas.pkl', 'Human_PF_Lung.pkl', 'Human_Placenta_Decidua.pkl', 'Immune_All_High.pkl', 'Immune_All_Low.pkl', 'Lethal_COVID19_Lung.pkl', 'Mouse_Dentate_Gyrus.pkl', 'Mouse_Isocortex_Hippocampus.pkl', 'Mouse_Postnatal_DentateGyrus.pkl', 'Mouse_Whole_Brain.pkl', 'Nuclei_Lung_Airway.pkl', 'Pan_Fetal_Human.pkl']
Train model with refdata¶
我们还可以下载网上的数据,完善注释训练模型
示例数据下载链接:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse116256
解压后放至ref_data文件夹
print("当前工作目录:", os.getcwd())
当前工作目录: C:\Users\Administrator\Desktop\test
# *.dem.txt.gz files hold the expression matrices, *.anno.txt.gz files the per-cell annotations
os.listdir('ref_data/')
['GSM3587923_AML1012-D0.dem.txt.gz', 'GSM3587924_AML1012-D0.anno.txt.gz', 'GSM3587925_AML210A-D0.dem.txt.gz', 'GSM3587926_AML210A-D0.anno.txt.gz', 'GSM3587927_AML314-D0.dem.txt.gz', 'GSM3587928_AML314-D0.anno.txt.gz', 'GSM3587929_AML314-D31.dem.txt.gz', 'GSM3587930_AML314-D31.anno.txt.gz', 'GSM3587931_AML328-D0.dem.txt.gz', 'GSM3587932_AML328-D0.anno.txt.gz', 'GSM3587933_AML328-D113.dem.txt.gz', 'GSM3587934_AML328-D113.anno.txt.gz', 'GSM3587935_AML328-D171.dem.txt.gz', 'GSM3587936_AML328-D171.anno.txt.gz', 'GSM3587937_AML328-D29.dem.txt.gz', 'GSM3587938_AML328-D29.anno.txt.gz', 'GSM3587939_AML328.nanopore.txt.gz', 'GSM3587940_AML329-D0.dem.txt.gz', 'GSM3587941_AML329-D0.anno.txt.gz', 'GSM3587942_AML329-D20.dem.txt.gz', 'GSM3587943_AML329-D20.anno.txt.gz', 'GSM3587944_AML329-D37.dem.txt.gz', 'GSM3587945_AML329-D37.anno.txt.gz', 'GSM3587946_AML371-D0.dem.txt.gz', 'GSM3587947_AML371-D0.anno.txt.gz', 'GSM3587948_AML371-D34.dem.txt.gz', 'GSM3587949_AML371-D34.anno.txt.gz', 'GSM3587950_AML419A-D0.dem.txt.gz', 'GSM3587951_AML419A-D0.anno.txt.gz', 'GSM3587952_AML419A.nanopore.txt.gz', 'GSM3587953_AML420B-D0.dem.txt.gz', 'GSM3587954_AML420B-D0.anno.txt.gz', 'GSM3587955_AML420B-D14.dem.txt.gz', 'GSM3587956_AML420B-D14.anno.txt.gz', 'GSM3587957_AML420B-D35.dem.txt.gz', 'GSM3587958_AML420B-D35.anno.txt.gz', 'GSM3587959_AML475-D0.dem.txt.gz', 'GSM3587960_AML475-D0.anno.txt.gz', 'GSM3587961_AML475-D29.dem.txt.gz', 'GSM3587962_AML475-D29.anno.txt.gz', 'GSM3587963_AML556-D0.dem.txt.gz', 'GSM3587964_AML556-D0.anno.txt.gz', 'GSM3587965_AML556-D15.dem.txt.gz', 'GSM3587966_AML556-D15.anno.txt.gz', 'GSM3587967_AML556-D31.dem.txt.gz', 'GSM3587968_AML556-D31.anno.txt.gz', 'GSM3587969_AML707B-D0.dem.txt.gz', 'GSM3587970_AML707B-D0.anno.txt.gz', 'GSM3587971_AML707B-D113.dem.txt.gz', 'GSM3587972_AML707B-D113.anno.txt.gz', 'GSM3587973_AML707B-D18.dem.txt.gz', 'GSM3587974_AML707B-D18.anno.txt.gz', 'GSM3587975_AML707B-D41.dem.txt.gz', 'GSM3587976_AML707B-D41.anno.txt.gz', 
'GSM3587977_AML707B-D97.dem.txt.gz', 'GSM3587978_AML707B-D97.anno.txt.gz', 'GSM3587979_AML707B.nanopore.txt.gz', 'GSM3587980_AML722B-D0.dem.txt.gz', 'GSM3587981_AML722B-D0.anno.txt.gz', 'GSM3587982_AML722B-D49.dem.txt.gz', 'GSM3587983_AML722B-D49.anno.txt.gz', 'GSM3587984_AML870-D0.dem.txt.gz', 'GSM3587985_AML870-D0.anno.txt.gz', 'GSM3587986_AML870-D14.dem.txt.gz', 'GSM3587987_AML870-D14.anno.txt.gz', 'GSM3587988_AML916-D0.dem.txt.gz', 'GSM3587989_AML916-D0.anno.txt.gz', 'GSM3587990_AML921A-D0.dem.txt.gz', 'GSM3587991_AML921A-D0.anno.txt.gz', 'GSM3587992_AML997-D0.dem.txt.gz', 'GSM3587993_AML997-D0.anno.txt.gz', 'GSM3587994_AML997-D35.dem.txt.gz', 'GSM3587995_AML997-D35.anno.txt.gz', 'GSM3587996_BM1.anno.txt.gz', 'GSM3587996_BM1.dem.txt.gz', 'GSM3587997_BM2.anno.txt.gz', 'GSM3587997_BM2.dem.txt.gz', 'GSM3587998_BM3.dem.txt.gz', 'GSM3587999_BM3.anno.txt.gz', 'GSM3588000_BM4.dem.txt.gz', 'GSM3588001_BM4.anno.txt.gz', 'GSM3588002_BM5-34p.anno.txt.gz', 'GSM3588002_BM5-34p.dem.txt.gz', 'GSM3588003_BM5-34p38n.anno.txt.gz', 'GSM3588003_BM5-34p38n.dem.txt.gz', 'GSM3588004_MUTZ3.anno.txt.gz', 'GSM3588004_MUTZ3.dem.txt.gz', 'GSM3588005_OCI-AML3.anno.txt.gz', 'GSM3588005_OCI-AML3.dem.txt.gz']
# Build one AnnData per reference sample by pairing each '*dem*' expression
# file in ref_data/ with its '*anno*' annotation file, then concatenate them.
ref_files = os.listdir('ref_data/')  # list the directory once, not once per sample
rdatas = []
for dem in [x for x in ref_files if 'dem' in x]:
    basename = dem.split('.')[0]
    samp = basename.split('_')[1]

    # The matching annotation file has the same sample name between the first
    # '_' and the first '.'. Testing 'anno' first skips unrelated file names
    # before they are split.
    anno = next(
        (x for x in ref_files
         if 'anno' in x and x.split('_')[1].split('.')[0] == samp),
        None,
    )
    if anno is None:
        raise FileNotFoundError(f"no annotation file found in ref_data/ for {dem}")

    # Read the expression matrix and transpose it.
    temp_data = sc.read_text('ref_data/' + dem).T
    # Read the annotation table, keeping only the cell-type column.
    temp_anno = pd.read_table('ref_data/' + anno, index_col=0)[['CellType']]

    # A left join keeps exactly one row per cell in the matrix, in the original
    # order; cells missing from the annotation get NaN instead of silently
    # shrinking .obs.
    temp_data.obs = temp_data.obs.merge(
        right=temp_anno, how='left', left_index=True, right_index=True
    )
    temp_data.obs['ID'] = basename
    rdatas.append(temp_data)

# sc.concat combines the AnnData objects along the cell axis.
rdata = sc.concat(rdatas)
# Group the cells by the CellType column and count how many cells each type has
rdata.obs.groupby('CellType').size()
CellType B 520 CTL 1338 GMP 918 GMP-like 3055 HSC 1709 HSC-like 1979 Mono 2758 Mono-like 2655 NK 1969 Plasma 1146 ProB 298 ProMono 1146 ProMono-like 1917 Prog 1709 Prog-like 4438 T 7105 cDC 849 cDC-like 2125 earlyEry 1119 lateEry 1329 pDC 225 dtype: int64
# Inspect the per-cell annotation table (CellType and sample ID)
rdata.obs
| CellType | ID | |
|---|---|---|
| AML1012-D0_AAAAAGTTACGT | GMP | GSM3587923_AML1012-D0 |
| AML1012-D0_AAAACACCAATC | GMP-like | GSM3587923_AML1012-D0 |
| AML1012-D0_AAAATAGCCTTT | Prog-like | GSM3587923_AML1012-D0 |
| AML1012-D0_AAACATTAAACG | ProMono-like | GSM3587923_AML1012-D0 |
| AML1012-D0_AAACCACGTGCN | Prog-like | GSM3587923_AML1012-D0 |
| ... | ... | ... |
| OCI-AML3_TTTGAGCCAACC | GMP-like | GSM3588005_OCI-AML3 |
| OCI-AML3_TTTGCCACAGGC | ProMono-like | GSM3588005_OCI-AML3 |
| OCI-AML3_TTTTATGAATTN | GMP-like | GSM3588005_OCI-AML3 |
| OCI-AML3_TTTTGACTTTTN | GMP-like | GSM3588005_OCI-AML3 |
| OCI-AML3_TTTTGTGACCGT | GMP-like | GSM3588005_OCI-AML3 |
41090 rows × 2 columns
# Gene filter: drop genes detected in fewer than 10 cells.
sc.pp.filter_genes(rdata, min_cells=10)

# Scale every cell to a total of 10,000 counts, then apply log(1 + x).
# NOTE: this normalisation is used here only for cell annotation (it is what
# the CellTypist authors recommend); it is not the best general-purpose choice.
sc.pp.normalize_total(rdata, target_sum=1e4)
sc.pp.log1p(rdata)

# Keep only the cells that actually carry a CellType label (non-NaN).
has_label = rdata.obs['CellType'].notna()
rdata = rdata[has_label]
rdata
View of AnnData object with n_obs × n_vars = 40307 × 19616
obs: 'CellType', 'ID'
var: 'n_cells'
uns: 'log1p'
# Train a custom CellTypist model on the first reference dataset.
ref_model = celltypist.train(rdata, # log-normalised AnnData holding the expression values and the cell-type annotations
    labels = 'CellType', # obs column that contains the cell-type label of each cell
    n_jobs = 8, # number of parallel jobs used for training (8 here)
    use_SGD = True, # train the logistic regression with stochastic gradient descent (SGD); False uses the default solver
    feature_selection = True, # run a first training round to select informative genes, then retrain on those genes only
    top_genes = 300) # per the CellTypist documentation: top genes taken from EACH cell type; the union over all types is the feature set (2500 features in the logged run, not 300 in total)
🍳 Preparing data before training 🔬 Input data has 40307 cells and 19616 genes ⚖️ Scaling input data 🏋️ Training data using SGD logistic regression 🔎 Selecting features 🧬 2500 features are selected 🏋️ Starting the second round of training 🏋️ Training data using SGD logistic regression ✅ Model training done!
随机梯度下降 (SGD)?
SGD (Stochastic Gradient Descent):
是一种常用的优化算法,特别适用于大规模数据集的机器学习模型训练。
每次只使用一个或几个样本进行梯度计算和参数更新,而不是整个数据集,从而加快训练速度和提高计算效率。
由于其快速和有效的特性,SGD 常用于深度学习和大数据集的训练中
不使用 SGD(默认优化算法):在某些情况下,默认优化算法可能更适合,具体取决于数据的性质和模型的复杂性。
使用 SGD:在大数据集或需要快速训练的情况下,SGD 可能更为高效。
当然了,我们为了速度快,理所应当的要付出一些代价,不像梯度下降法那样每次更新都会朝着Loss不断减小的方向去移动,最后收敛于极值点(凸函数收敛于全局极值点,非凸函数可能会收敛于局部极值点),SGD由于每次选择样本的随机性,会有些许波动,也就是下图的SGD收敛示意图,走的路会比较曲折,有时候会从一个点突然跳到另外一个点去,不过这样也有好处,因为对于非凸Loss函数,我们用梯度下降法很可能收敛在局部极值点就不动了,但是SGD却能用它的随机选择样本更新梯度的特性跳出局部极值点,很可能在非凸Loss函数中找到全局极值点。
# Save the trained model into CellTypist's own model directory. The directory is
# taken from `models.models_path` instead of a hard-coded, user-specific absolute
# path, so the cell works for any user and operating system.
os.makedirs(models.models_path, exist_ok=True)
ref_model.write(os.path.join(models.models_path, 'ref.pkl'))
# List the available models; the newly written ref.pkl should now be included.
models.get_all_models()
['Adult_COVID19_PBMC.pkl', 'Adult_CynomolgusMacaque_Hippocampus.pkl', 'Adult_Human_PancreaticIslet.pkl', 'Adult_Human_Skin.pkl', 'Adult_Mouse_Gut.pkl', 'Adult_Mouse_OlfactoryBulb.pkl', 'Adult_Pig_Hippocampus.pkl', 'Adult_RhesusMacaque_Hippocampus.pkl', 'Autopsy_COVID19_Lung.pkl', 'Cells_Adult_Breast.pkl', 'Cells_Fetal_Lung.pkl', 'Cells_Human_Tonsil.pkl', 'Cells_Intestinal_Tract.pkl', 'Cells_Lung_Airway.pkl', 'COVID19_HumanChallenge_Blood.pkl', 'COVID19_Immune_Landscape.pkl', 'Developing_Human_Brain.pkl', 'Developing_Human_Gonads.pkl', 'Developing_Human_Hippocampus.pkl', 'Developing_Human_Organs.pkl', 'Developing_Human_Thymus.pkl', 'Developing_Mouse_Brain.pkl', 'Developing_Mouse_Hippocampus.pkl', 'Fetal_Human_AdrenalGlands.pkl', 'Fetal_Human_Pancreas.pkl', 'Fetal_Human_Pituitary.pkl', 'Fetal_Human_Retina.pkl', 'Fetal_Human_Skin.pkl', 'Healthy_Adult_Heart.pkl', 'Healthy_COVID19_PBMC.pkl', 'Healthy_Human_Liver.pkl', 'Healthy_Mouse_Liver.pkl', 'Human_AdultAged_Hippocampus.pkl', 'Human_Colorectal_Cancer.pkl', 'Human_Developmental_Retina.pkl', 'Human_Embryonic_YolkSac.pkl', 'Human_IPF_Lung.pkl', 'Human_Longitudinal_Hippocampus.pkl', 'Human_Lung_Atlas.pkl', 'Human_PF_Lung.pkl', 'Human_Placenta_Decidua.pkl', 'Immune_All_High.pkl', 'Immune_All_Low.pkl', 'Lethal_COVID19_Lung.pkl', 'Mouse_Dentate_Gyrus.pkl', 'Mouse_Isocortex_Hippocampus.pkl', 'Mouse_Postnatal_DentateGyrus.pkl', 'Mouse_Whole_Brain.pkl', 'Nuclei_Lung_Airway.pkl', 'Pan_Fetal_Human.pkl', 'ref.pkl']
How to use RDS data as training data?¶
# %pip install anndata2ri
# Older anndata2ri releases use the np.float_ attribute, which was removed in
# NumPy 2.0 (replaced by np.float64).
# Recommended fix: upgrade anndata2ri,
# or install an older NumPy: pip install --force-reinstall numpy==1.26.4
# Import anndata2ri and activate the AnnData <-> R object conversion
from anndata2ri import activate
activate()
# Load the rpy2 IPython extension
%reload_ext rpy2.ipython
# R code cells (%%R) can now be used in this notebook
C:\Users\Administrator\AppData\Local\Temp\ipykernel_21456\2347486068.py:3: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter. activate() C:\ProgramData\anaconda3\envs\sc2024\lib\site-packages\rpy2\robjects\packages.py:367: UserWarning: The symbol 'quartz' is not in this R namespace/package. warnings.warn(
如何解决Python调用R出现“UnicodeDecodeError: ‘utf-8‘ codec can‘t decode byte 0xb2” 问题 https://blog.csdn.net/qq_44645101/article/details/127069531
#%%R
#install.packages('Seurat')
#BiocManager::install("SummarizedExperiment")
%%R
# Seurat: used below for CreateSeuratObject() and as.SingleCellExperiment()
library(Seurat)
# SummarizedExperiment: used below for assay() and colData()
library(SummarizedExperiment)
WARNING: The R package "reticulate" only fixed recently
an issue that caused a segfault when used with rpy2:
https://github.com/rstudio/reticulate/pull/1188
Make sure that you use a version of that package that includes
the fix.
Attaching SeuratObject
载入需要的程辑包:MatrixGenerics
载入需要的程辑包:matrixStats
载入程辑包:'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
载入需要的程辑包:GenomicRanges
载入需要的程辑包:stats4
载入需要的程辑包:BiocGenerics
载入程辑包:'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
载入需要的程辑包:S4Vectors
载入程辑包:'S4Vectors'
The following objects are masked from 'package:base':
expand.grid, I, unname
载入需要的程辑包:IRanges
载入程辑包:'IRanges'
The following object is masked from 'package:grDevices':
windows
载入需要的程辑包:GenomeInfoDb
载入需要的程辑包:Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
载入程辑包:'Biobase'
The following object is masked from 'package:MatrixGenerics':
rowMedians
The following objects are masked from 'package:matrixStats':
anyMissing, rowMedians
载入程辑包:'SummarizedExperiment'
The following object is masked from 'package:SeuratObject':
Assays
The following object is masked from 'package:Seurat':
Assays
In addition: Warning messages:
1: 程辑包'Seurat'是用R版本4.2.3 来建造的
2: 程辑包'matrixStats'是用R版本4.2.3 来建造的
%%R -o healthy
# `-o healthy` on the magic line exports the R variable `healthy` to the Python
# session when the cell finishes. Keep comments off the `%%R` line itself:
# everything after `%%R` on that line is handed to the magic as arguments,
# not ignored as a comment.

# Read the RDS reference data
rse <- readRDS("ref_data2/scRNA-Healthy-Hematopoiesis-191120.rds")
# Build a Seurat object from the raw counts and the per-cell metadata
seurat_object <- CreateSeuratObject(counts = assay(rse, "counts"), meta.data = as.data.frame(colData(rse)))
# Convert the Seurat object into a SingleCellExperiment object
healthy <- as.SingleCellExperiment(seurat_object)
In addition: Warning messages: 1: Layer ‘data’ is empty 2: Layer ‘scale.data’ is empty
What is the difference between a SingleCellExperiment object and a Seurat object?
Seurat:主要用于数据分析和可视化,有许多高级功能和算法,适合完整的单细胞分析工作流。
SingleCellExperiment:主要用于数据存储和管理,提供一个标准化的数据结构,便于在不同 Bioconductor 包之间交换数据。
# Inspect the per-cell annotation table of the second reference dataset
healthy.obs
| orig.ident | nCount_RNA | nFeature_RNA | Group | nUMI_pre | nUMI | nGene | initialClusters | UMAP1 | UMAP2 | Clusters | BioClassification | Barcode | ident | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| CD34_32_R5:AAACCTGAGTATCGAA-1 | CD34 | 8303.0 | 3187 | CD34_D2T1 | 17876.0 | 8303.0 | 3187.0 | Cluster1 | -6.113410 | 4.616498 | Cluster5 | 05_CMP.LMPP | AAACCTGAGTATCGAA-1 | CD34 |
| CD34_32_R5:AAACCTGAGTCGTTTG-1 | CD34 | 3917.0 | 1787 | CD34_D2T1 | 9277.0 | 3917.0 | 1787.0 | Cluster2 | -8.800932 | -1.228907 | Cluster8 | 08_GMP.Neut | AAACCTGAGTCGTTTG-1 | CD34 |
| CD34_32_R5:AAACCTGGTTCCACAA-1 | CD34 | 6023.0 | 2552 | CD34_D2T1 | 13073.0 | 6023.0 | 2552.0 | Cluster3 | -9.723482 | 7.335178 | Cluster1 | 01_HSC | AAACCTGGTTCCACAA-1 | CD34 |
| CD34_32_R5:AAACGGGAGCTTCGCG-1 | CD34 | 4493.0 | 2191 | CD34_D2T1 | 8412.0 | 4493.0 | 2191.0 | Cluster4 | -4.293071 | 5.692705 | Cluster6 | 06_CLP.1 | AAACGGGAGCTTCGCG-1 | CD34 |
| CD34_32_R5:AAACGGGAGGGAGTAA-1 | CD34 | 5190.0 | 2322 | CD34_D2T1 | 11914.0 | 5190.0 | 2322.0 | Cluster3 | -7.989706 | 9.108693 | Cluster1 | 01_HSC | AAACGGGAGGGAGTAA-1 | CD34 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| BMMC_10x_GREENLEAF_REP2:TTTGTTGAGTTAGTAG-1 | BMMC | 5240.0 | 2377 | BMMC_D1T2 | 7515.0 | 5240.0 | 2377.0 | Cluster9 | -5.708933 | -2.843953 | Cluster7 | 07_GMP | TTTGTTGAGTTAGTAG-1 | BMMC |
| BMMC_10x_GREENLEAF_REP2:TTTGTTGAGTTCATGC-1 | BMMC | 1338.0 | 882 | BMMC_D1T2 | 1597.0 | 1338.0 | 882.0 | Cluster16 | -2.258849 | -10.712415 | Cluster12 | 12_CD14.Mono.2 | TTTGTTGAGTTCATGC-1 | BMMC |
| BMMC_10x_GREENLEAF_REP2:TTTGTTGCAGTGGTGA-1 | BMMC | 1660.0 | 1004 | BMMC_D1T2 | 2831.0 | 1660.0 | 1004.0 | Cluster22 | 4.987181 | -8.794110 | Cluster19 | 19_CD8.N | TTTGTTGCAGTGGTGA-1 | BMMC |
| BMMC_10x_GREENLEAF_REP2:TTTGTTGCATGTGTCA-1 | BMMC | 1631.0 | 951 | BMMC_D1T2 | 2852.0 | 1631.0 | 951.0 | Cluster26 | 6.086861 | -3.454188 | Cluster20 | 20_CD4.N1 | TTTGTTGCATGTGTCA-1 | BMMC |
| BMMC_10x_GREENLEAF_REP2:TTTGTTGCATTGAAAG-1 | BMMC | 1665.0 | 963 | BMMC_D1T2 | 2759.0 | 1665.0 | 963.0 | Cluster12 | 1.101248 | 12.074113 | Cluster17 | 17_B | TTTGTTGCATTGAAAG-1 | BMMC |
35582 rows × 14 columns
# Normalise the second reference: scale each cell to 10,000 total counts, then
# apply log(1 + x).
# NOTE: this normalisation is only meant for cell annotation (recommended by the
# CellTypist authors), not as the best general-purpose preprocessing.
sc.pp.normalize_total(healthy, target_sum=1e4)
sc.pp.log1p(healthy)

# Training options for the second custom CellTypist model.
ref2_train_options = dict(
    labels='BioClassification',  # obs column holding the cell-type labels
    n_jobs=22,                   # number of parallel jobs used for training
    use_SGD=False,               # False: use the default solver instead of SGD
    feature_selection=True,      # select informative genes, then retrain on them
    top_genes=300,               # value passed as top_genes for feature selection
)
ref2_model = celltypist.train(healthy, **ref2_train_options)
🍳 Preparing data before training ✂️ 3368 non-expressed genes are filtered out 🔬 Input data has 35582 cells and 16919 genes ⚖️ Scaling input data 🏋️ Training data using SGD logistic regression 🔎 Selecting features 🧬 2860 features are selected 🏋️ Starting the second round of training 🏋️ Training data using logistic regression ✅ Model training done!
# Save the second model into CellTypist's own model directory (taken from
# `models.models_path` rather than a hard-coded, user-specific home path) so that
# models.Model.load(model="ref2.pkl") can find it by name on any machine.
os.makedirs(models.models_path, exist_ok=True)
ref2_model.write(os.path.join(models.models_path, 'ref2.pkl'))
Predict cell types with CellTypist models¶
After training and writing the models, we need to restart the kernel to free memory and then reload the modules.
# Load the three models used for prediction: the two custom models written
# earlier (ref.pkl, ref2.pkl) and CellTypist's Immune_All_Low model.
ref_model, ref2_model, model_low = (
    models.Model.load(model=model_file)
    for model_file in ("ref.pkl", "ref2.pkl", "Immune_All_Low.pkl")
)
def predict_cells(adata):
    """Annotate one sample with the three CellTypist models and return its ``obs``.

    The AnnData object is modified in place (gene filtering, normalisation,
    log-transform, densified ``X`` and new ``obs`` columns), so pass a copy if
    the caller needs to keep the raw counts.

    Parameters
    ----------
    adata
        AnnData object of a single sample.

    Returns
    -------
    The ``adata.obs`` data frame extended with ``low_label``/``low_score``
    (``model_low``), ``ref_label``/``ref_score`` (``ref_model``) and
    ``ref2_label``/``ref2_score`` (``ref2_model``).
    """
    from scipy.sparse import issparse

    # Drop genes expressed in fewer than 10 cells.
    sc.pp.filter_genes(adata, min_cells=10)
    # Scale every cell to 10,000 total counts (used here only for annotation;
    # not recommended as typical preprocessing), then apply log(1 + x).
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    # Densify the matrix only when it is sparse: a dense array has no
    # .toarray() method. Dense matrices can use a lot of memory on large data.
    if issparse(adata.X):
        adata.X = adata.X.toarray()

    # Run the three models in the same order as before; each adds one label
    # column and one confidence-score column to adata.obs.
    named_models = (('low', model_low), ('ref', ref_model), ('ref2', ref2_model))
    for prefix, model in named_models:
        predictions = celltypist.annotate(adata, model=model, majority_voting=False)
        predicted_obs = predictions.to_adata().obs
        adata.obs[f'{prefix}_label'] = predicted_obs.loc[adata.obs.index, 'predicted_labels']
        adata.obs[f'{prefix}_score'] = predicted_obs.loc[adata.obs.index, 'conf_score']

    return adata.obs
在 celltypist.annotate 函数中的majority_voting 参数?
当设置为 False 时,celltypist 对每个细胞独立进行预测:直接取该细胞概率最高的细胞类型作为预测标签(predicted_labels),并给出相应的置信分数(conf_score)。这种方式速度快,但相似细胞之间的标签可能不一致。
当设置为 True 时,celltypist 会在逐细胞预测的基础上先对数据进行过聚类(over-clustering),然后在每个小簇内进行多数投票,把簇内出现最多的预测标签赋给该簇的所有细胞(majority_voting 列)。这种方式可以平滑局部噪音,得到更一致的注释,但耗时更长。
majority_voting=False 的应用场景:
1需要每个细胞独立的预测标签和置信分数(如本教程中按样本逐个预测)。
2数据量很大,希望预测速度尽可能快。
3在一些分类不确定的情况下,希望保留单个细胞层面的差异,后续再结合聚类结果进一步分析。
majority_voting=True 的应用场景:
1希望同一局部簇内的细胞得到一致的标签,减少零散的错误注释。
2在简化分析和结果展示时使用。
# Load every preprocessed sample from pp_adata/. Only *.h5ad files are read, so
# stray entries such as Jupyter's .ipynb_checkpoints folder do not break the
# loop, and the names are sorted so the sample order (and therefore the row
# order after concatenation) is the same on every machine.
adatas = [
    sc.read_h5ad(os.path.join('pp_adata', fname))
    for fname in sorted(os.listdir('pp_adata'))
    if fname.endswith('.h5ad')
]
len(adatas)
75
# predict_cells() normalises its input in place, so each sample is passed as a
# copy; the objects in `adatas` keep their raw data.
score_columns = ['low_label', 'low_score', 'ref_label', 'ref_score', 'ref2_label', 'ref2_score']
per_sample_obs = [predict_cells(sample.copy()) for sample in adatas]
# Stack the per-sample tables and keep only the prediction columns.
predictions = pd.concat(per_sample_obs)[score_columns]
predictions
| low_label | low_score | ref_label | ref_score | ref2_label | ref2_score | |
|---|---|---|---|---|---|---|
| AGGCCACTCGAACCTA-1-AML19_REL | Neutrophil-myeloid progenitor | 0.030015 | cDC-like | 0.843493 | 07_GMP | 0.003871 |
| AACGTCACATGACGAG-1-AML19_REL | Neutrophil-myeloid progenitor | 0.227245 | Prog-like | 0.100215 | 05_CMP.LMPP | 0.999986 |
| CATGCCTGTCCGGTCA-1-AML19_REL | Mid erythroid | 0.993374 | lateEry | 0.999993 | 03_Late.Eryth | 1.000000 |
| CTCACTGAGATACAGT-1-AML19_REL | Macrophages | 0.204163 | cDC-like | 0.999983 | 07_GMP | 0.858804 |
| GATCGTAAGGCAGGTT-1-AML19_REL | Neutrophil-myeloid progenitor | 0.155923 | GMP-like | 0.039463 | 05_CMP.LMPP | 0.999619 |
| ... | ... | ... | ... | ... | ... | ... |
| GGTTAACTCAGACTGT-1-AML26_REL | Tem/Temra cytotoxic T cells | 0.984860 | NK | 1.000000 | 24_CD8.CM | 0.777919 |
| TCACGCTAGACATAAC-1-AML26_REL | CD16+ NK cells | 0.400283 | HSC-like | 0.003593 | 08_GMP.Neut | 0.032444 |
| TGTAAGCGTTGTGCCG-1-AML26_REL | DC | 0.105715 | Plasma | 0.009472 | 11_CD14.Mono.1 | 0.001033 |
| CGGGACTTCACCTCGT-1-AML26_REL | Classical monocytes | 0.347877 | HSC-like | 0.114122 | 11_CD14.Mono.1 | 0.000027 |
| CTCCGATTCTGGGTCG-1-AML26_REL | ETP | 0.066135 | HSC | 0.985449 | 02_Early.Eryth | 0.000099 |
300349 rows × 6 columns
# Export the prediction table to a CSV file.
predictions.to_csv(path_or_buf='PREDICTIONS.csv')
# Merge all per-sample AnnData objects into a single AnnData object.
adata = sc.concat(adatas)
adata
AnnData object with n_obs × n_vars = 300349 × 33538
obs: 'Patient', 'DX', 'Sample', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet', 'doublet_score'
# Inspect the per-cell metadata of the merged object
adata.obs
| Patient | DX | Sample | n_genes | n_genes_by_counts | log1p_n_genes_by_counts | total_counts | log1p_total_counts | pct_counts_in_top_20_genes | pct_counts_mt | pct_counts_ribo | pct_counts_hb | doublet | doublet_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AGGCCACTCGAACCTA-1-AML19_REL | AML19 | REL | AML19_REL | 7473 | 7473 | 8.919186 | 67223.0 | 11.115786 | 15.725273 | 6.877111 | 24.531782 | 0.011901 | 0.0 | 1.145570e+01 |
| AACGTCACATGACGAG-1-AML19_REL | AML19 | REL | AML19_REL | 7105 | 7105 | 8.868695 | 64416.0 | 11.073133 | 17.413376 | 7.959203 | 25.009314 | 0.006210 | 0.0 | 7.357605e+00 |
| CATGCCTGTCCGGTCA-1-AML19_REL | AML19 | REL | AML19_REL | 6142 | 6142 | 8.723069 | 57582.0 | 10.960983 | 26.296412 | 5.861207 | 21.247959 | 15.508319 | 0.0 | 4.986384e+00 |
| CTCACTGAGATACAGT-1-AML19_REL | AML19 | REL | AML19_REL | 7125 | 7125 | 8.871505 | 54708.0 | 10.909784 | 19.775901 | 6.607809 | 20.391899 | 0.036558 | 0.0 | 1.145570e+01 |
| GATCGTAAGGCAGGTT-1-AML19_REL | AML19 | REL | AML19_REL | 6876 | 6876 | 8.835938 | 52846.0 | 10.875156 | 17.195247 | 6.764940 | 27.057866 | 0.009461 | 0.0 | 7.332305e+00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| GGTTAACTCAGACTGT-1-AML26_REL | AML26 | REL | AML26_REL | 1016 | 1016 | 6.924612 | 2232.0 | 7.711101 | 24.955197 | 4.480287 | 23.700717 | 0.000000 | 0.0 | 1.365787e-04 |
| TCACGCTAGACATAAC-1-AML26_REL | AML26 | REL | AML26_REL | 1100 | 1100 | 7.003974 | 2221.0 | 7.706163 | 24.358397 | 7.384061 | 23.277803 | 0.045025 | 0.0 | 1.822980e-11 |
| TGTAAGCGTTGTGCCG-1-AML26_REL | AML26 | REL | AML26_REL | 1221 | 1221 | 7.108244 | 2286.0 | 7.734996 | 18.197725 | 7.655293 | 14.216973 | 0.000000 | 0.0 | 1.822980e-11 |
| CGGGACTTCACCTCGT-1-AML26_REL | AML26 | REL | AML26_REL | 1198 | 1198 | 7.089243 | 2216.0 | 7.703910 | 20.532491 | 10.063177 | 10.379061 | 0.000000 | 0.0 | 1.822980e-11 |
| CTCCGATTCTGGGTCG-1-AML26_REL | AML26 | REL | AML26_REL | 1267 | 1267 | 7.145196 | 2222.0 | 7.706613 | 15.346535 | 5.535554 | 14.491449 | 0.045005 | 0.0 | 1.822980e-11 |
300349 rows × 14 columns
scVI label transfer¶
Reload ref data¶
healthy and rdata need to be reloaded because they were normalized and log-transformed earlier; the seurat_v3 HVG selection and scVI used below expect raw counts.
ref data1¶
# Reload the first reference dataset from the raw files: for every sample in
# ref_data/, pair the expression matrix (file name contains 'dem') with its
# annotation table (file name contains 'anno'), attach the CellType labels and
# collect the per-sample AnnData objects in `rdatas`.
ref_dir = 'ref_data/'
# List the directory once (sorted for a reproducible sample order) instead of
# re-listing it for every sample.
ref_files = sorted(os.listdir(ref_dir))
# Sample name -> annotation file,
# e.g. 'AML1012-D0' -> 'GSM3587924_AML1012-D0.anno.txt.gz'.
anno_by_sample = {
    fname.split('_')[1].split('.')[0]: fname
    for fname in ref_files
    if 'anno' in fname
}

rdatas = []
for dem in (fname for fname in ref_files if 'dem' in fname):
    basename = dem.split('.')[0]
    samp = basename.split('_')[1]
    try:
        anno = anno_by_sample[samp]
    except KeyError:
        # Fail with a message that names the sample instead of a bare IndexError.
        raise FileNotFoundError(
            f'No annotation file found in {ref_dir} for sample {samp!r}'
        ) from None
    # Read the expression matrix and transpose it so that cells become the
    # observations (their names are matched against the annotation index below).
    temp_data = sc.read_text(ref_dir + dem).T
    # Read the annotation table, keeping only the CellType column.
    temp_anno = pd.read_table(ref_dir + anno, index_col=0)[['CellType']]
    # Left join: every cell of the matrix is kept even when it has no annotation
    # row (CellType becomes NaN), so the new obs always has the same length as
    # the matrix. Unlabelled cells are removed by the isna() filter below.
    temp_data.obs = temp_data.obs.merge(
        right=temp_anno, how='left', left_index=True, right_index=True
    )
    temp_data.obs['ID'] = basename
    rdatas.append(temp_data)

# sc.concat stacks the samples along the cell axis, aligning them on the gene names.
rdata = sc.concat(rdatas)
# Keep only genes expressed in at least 10 cells.
sc.pp.filter_genes(rdata, min_cells=10)
# Drop the cells whose CellType is NaN. `.copy()` turns the resulting view into a
# real AnnData object, because columns are added to rdata.obs later on.
rdata = rdata[~rdata.obs.CellType.isna()].copy()
ref data2¶
# Import anndata2ri and activate the AnnData <-> R object conversion
from anndata2ri import activate
activate()
# Load the rpy2 IPython extension
%reload_ext rpy2.ipython
# R code cells (%%R) can now be used in this notebook
C:\Users\Administrator\AppData\Local\Temp\ipykernel_21456\2347486068.py:3: DeprecationWarning: The global conversion available with activate() is deprecated and will be removed in the next major release. Use a local converter. activate() C:\ProgramData\anaconda3\envs\sc2024\lib\site-packages\rpy2\robjects\packages.py:367: UserWarning: The symbol 'quartz' is not in this R namespace/package. warnings.warn(
%%R
# Seurat: used below for CreateSeuratObject() and as.SingleCellExperiment()
library(Seurat)
# SummarizedExperiment: used below for assay() and colData()
library(SummarizedExperiment)
WARNING: The R package "reticulate" only fixed recently
an issue that caused a segfault when used with rpy2:
https://github.com/rstudio/reticulate/pull/1188
Make sure that you use a version of that package that includes
the fix.
Attaching SeuratObject
载入需要的程辑包:MatrixGenerics
载入需要的程辑包:matrixStats
载入程辑包:'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
载入需要的程辑包:GenomicRanges
载入需要的程辑包:stats4
载入需要的程辑包:BiocGenerics
载入程辑包:'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
载入需要的程辑包:S4Vectors
载入程辑包:'S4Vectors'
The following objects are masked from 'package:base':
expand.grid, I, unname
载入需要的程辑包:IRanges
载入程辑包:'IRanges'
The following object is masked from 'package:grDevices':
windows
载入需要的程辑包:GenomeInfoDb
载入需要的程辑包:Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
载入程辑包:'Biobase'
The following object is masked from 'package:MatrixGenerics':
rowMedians
The following objects are masked from 'package:matrixStats':
anyMissing, rowMedians
载入程辑包:'SummarizedExperiment'
The following object is masked from 'package:SeuratObject':
Assays
The following object is masked from 'package:Seurat':
Assays
In addition: Warning messages:
1: 程辑包'Seurat'是用R版本4.2.3 来建造的
2: 程辑包'matrixStats'是用R版本4.2.3 来建造的
%%R -o healthy
# `-o healthy` on the magic line exports the R variable `healthy` to the Python
# session when the cell finishes. Keep comments off the `%%R` line itself:
# everything after `%%R` on that line is handed to the magic as arguments,
# not ignored as a comment.

# Read the RDS reference data
rse <- readRDS("ref_data2/scRNA-Healthy-Hematopoiesis-191120.rds")
# Build a Seurat object from the raw counts and the per-cell metadata
seurat_object <- CreateSeuratObject(counts = assay(rse, "counts"), meta.data = as.data.frame(colData(rse)))
# Convert the Seurat object into a SingleCellExperiment object
healthy <- as.SingleCellExperiment(seurat_object)
In addition: Warning messages: 1: Layer ‘data’ is empty 2: Layer ‘scale.data’ is empty
Concatenating analysing data and ref data¶
根据'Sample', 'CellType', 'Batch'三列进行合并
# Query (AML) cells: no cell type known yet, so they get the placeholder label
# 'Unknown' and the batch name 'AML'.
adata.obs['CellType'] = 'Unknown'
adata.obs['Batch'] = 'AML'

# Second reference (healthy): reuse BioClassification as the cell-type label and
# tag the batch as 'ref2'.
healthy.obs['CellType'] = healthy.obs['BioClassification']
healthy.obs['Batch'] = 'ref2'
# The sample name is the part of each cell name (obs index entry) before the first ':'.
healthy.obs['Sample'] = [cell_name.split(':')[0] for cell_name in healthy.obs.index]
lambda 函数是一种单行函数,可以用于需要临时函数的地方
使用 lambda 函数将列表中的每个元素乘以 2
numbers = [1, 2, 3, 4]
doubled = list(map(lambda x: x * 2, numbers))
print(doubled)  # 输出: [2, 4, 6, 8]
# First reference (rdata): tag the batch as 'ref' and use the ID column as the
# sample name.
rdata.obs['Batch'] = 'ref'
rdata.obs['Sample'] = rdata.obs['ID']

# Combine the query data and both references into one AnnData object.
dater = sc.concat([adata, healthy, rdata])

# Flag highly variable genes on the combined data and keep only those genes.
hvg_options = dict(
    flavor='seurat_v3',   # Seurat v3 method for selecting variable genes
    n_top_genes=2000,     # keep the 2000 most variable genes
    batch_key='Batch',    # take the batch into account when selecting genes
    subset=True,          # subset the object to the selected genes
)
sc.pp.highly_variable_genes(dater, **hvg_options)
为什么要将我的单细胞数据与refdata合并后,来识别高变基因?
1. 批次效应校正
通过将数据合并,可以在识别高变异基因时考虑批次效应,确保这些基因在不同批次之间具有一致的变异性。这有助于减少技术噪音的影响,使生物学信号更加明显。
2. 跨批次的基因选择
将数据与参考数据合并后识别高变异基因,可以确保所选择的基因在不同的数据集和批次中都是具有代表性的,从而提高分析结果的稳定性和可靠性。
3. 一致性验证
通过与参考数据合并,可以验证你数据中的高变异基因是否在其他数据集中也表现出高变异性。这有助于确保所识别的高变异基因具有生物学意义,而不仅仅是技术噪音。
# Show a summary of the combined object after the highly-variable-gene selection
dater
AnnData object with n_obs × n_vars = 376238 × 2000
obs: 'Sample', 'CellType', 'Batch'
var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
uns: 'hvg'
scVI predict¶
import scvi
scVI分析的话比较吃资源,如果有GPU的话速度会快5-10倍;如果没有GPU,仅用CPU运行scVI,还需谨慎。因为CPU模式的scVI运行速度实在是慢,特别是单细胞大样本数据的运行
但我们已经筛选了高变基因,速度会快很多
# Register the combined data with scVI ('Batch' as batch key, 'Sample' as an
# additional categorical covariate), then build and train the model.
scvi_model_cls = scvi.model.SCVI
scvi_model_cls.setup_anndata(
    dater,
    categorical_covariate_keys=['Sample'],
    batch_key='Batch',
)
vae = scvi_model_cls(dater)
vae.train()
Unable to initialize backend 'cuda':
Unable to initialize backend 'rocm': module 'jaxlib.xla_extension' has no attribute 'GpuAllocatorConfig'
Unable to initialize backend 'tpu': INTERNAL: Failed to open libtpu.so: libtpu.so: cannot open shared object file: No such file or directory
An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
INFO: GPU available: True (cuda), used: True
GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
HPU available: False, using: 0 HPUs
INFO: You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 21/21: 100%|█| 21/21 [03:16<00:00, 9.27s/it, v_num=1, train_loss_step=587
INFO: `Trainer.fit` stopped: `max_epochs=21` reached. `Trainer.fit` stopped: `max_epochs=21` reached.
Epoch 21/21: 100%|█| 21/21 [03:16<00:00, 9.38s/it, v_num=1, train_loss_step=587
# Build a scANVI model from the trained scVI model to predict the cell type of
# the cells currently labelled 'Unknown'.
lvae = scvi.model.SCANVI.from_scvi_model(
    vae,
    adata=dater,
    labels_key='CellType',
    unlabeled_category='Unknown',
)
# max_epochs: maximum number of training epochs.
# n_samples_per_label: number of samples per label passed to training.
lvae.train(n_samples_per_label=100, max_epochs=20)
INFO Training for 20 epochs.
INFO: GPU available: True (cuda), used: True GPU available: True (cuda), used: True INFO: TPU available: False, using: 0 TPU cores TPU available: False, using: 0 TPU cores INFO: IPU available: False, using: 0 IPUs IPU available: False, using: 0 IPUs INFO: HPU available: False, using: 0 HPUs HPU available: False, using: 0 HPUs INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0] LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 20/20: 100%|█| 20/20 [07:25<00:00, 22.52s/it, v_num=1, train_loss_step=589
INFO: `Trainer.fit` stopped: `max_epochs=20` reached. `Trainer.fit` stopped: `max_epochs=20` reached.
Epoch 20/20: 100%|█| 20/20 [07:25<00:00, 22.28s/it, v_num=1, train_loss_step=589
# Predicted label for every cell.
dater.obs['predicted'] = lvae.predict(dater)

# soft=True returns the per-class probabilities; the row-wise maximum
# (axis=1) is stored as the transfer score of each cell.
class_probabilities = lvae.predict(soft=True)
dater.obs['transfer_score'] = class_probabilities.max(axis=1)

# Keep only the analysed (AML) cells and drop the reference cells.
is_query_cell = dater.obs['Batch'] == 'AML'
dater = dater[is_query_cell]
dater
View of AnnData object with n_obs × n_vars = 300349 × 2000
obs: 'Sample', 'CellType', 'Batch', '_scvi_batch', '_scvi_labels', 'predicted', 'transfer_score'
var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
uns: 'hvg', '_scvi_uuid', '_scvi_manager_uuid'
obsm: '_scvi_extra_categorical_covs'
# Add the scVI/scANVI predictions first and the CellTypist predictions second,
# each merged into adata.obs on the cell index.
scvi_columns = dater.obs[['predicted', 'transfer_score']]
for extra_columns in (scvi_columns, predictions):
    adata.obs = adata.obs.merge(right=extra_columns, left_index=True, right_index=True)

# Write the annotated, not yet integrated data to disk. The file name is kept
# exactly as spelled because the integration section reads it back by this name.
adata.write_h5ad('unintigrated.h5ad')
... storing 'CellType' as categorical ... storing 'Batch' as categorical ... storing 'predicted' as categorical ... storing 'low_label' as categorical ... storing 'ref_label' as categorical ... storing 'ref2_label' as categorical
Integration¶
We need to restart the kernel to free memory and then reload the modules.
Loading data and setting scVI model¶
What is the scVI model of deep learning?
模型结构如下图所示,将scRNA-seq数据中每个细胞作为一个样本,其基因表达作为特征,通过encoder的神经网络与重参数化将高维的基因表示压缩到低维隐空间(比如说10维);之后基于单细胞RNA测序数据基因表达量服从零膨胀负二项分布 (ZINB) 的假设,再利用decoder的神经网络将隐空间映射到基因表达分布参数的后验估计上, scvi的网络基本还是一个VAE的结构
# Read back the annotated, not yet integrated data written at the end of the
# label-transfer section
adata = sc.read('unintigrated.h5ad')
adata
AnnData object with n_obs × n_vars = 300349 × 33538
obs: 'Patient', 'DX', 'Sample', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet', 'doublet_score', 'CellType', 'Batch', 'predicted', 'transfer_score', 'low_label', 'low_score', 'ref_label', 'ref_score', 'ref2_label', 'ref2_score'
from scvi.autotune import ModelTuner
# ModelTuner automates the hyperparameter search for the SCVI model: given a
# search space and a resource configuration it runs several trials to find the
# parameter combination that works best for the data.
from ray import tune # tune is the experiment-scheduling / hyperparameter-optimisation module of the Ray library
# Keep only the genes that are expressed in at least 50 cells
sc.pp.filter_genes(adata, min_cells = 50)
adata
AnnData object with n_obs × n_vars = 300349 × 21156
obs: 'Patient', 'DX', 'Sample', 'n_genes', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'pct_counts_mt', 'pct_counts_ribo', 'pct_counts_hb', 'doublet', 'doublet_score', 'CellType', 'Batch', 'predicted', 'transfer_score', 'low_label', 'low_score', 'ref_label', 'ref_score', 'ref2_label', 'ref2_score'
var: 'n_cells'
# Model class to tune.
model_cls = scvi.model.SCVI

# Register the AnnData object with the model class:
#   - 'Sample' is a categorical covariate (tells the samples apart for integration)
#   - mitochondrial and ribosomal count percentages are continuous covariates
model_cls.setup_anndata(
    adata,
    continuous_covariate_keys=['pct_counts_mt', 'pct_counts_ribo'],
    categorical_covariate_keys=['Sample'],
)

# Create the ModelTuner for this model class.
tuner = ModelTuner(model_cls)
An NVIDIA GPU may be present on this machine, but a CUDA-enabled jaxlib is not installed. Falling back to cpu.
What to use as the batch label?
最常见的方法是将每个样本定义为一个批次,这通常会产生最强的批次校正。 但是,样本通常会与您可能希望保留的生物因素相混淆。 例如,设想一个实验从组织中的两个位置采集样本。 如果将样本视为批次,那么数据整合方法将试图消除样本之间的差异,从而消除位置之间的差异。 在这种情况下,将捐献者作为批次来消除个体之间的差异可能更为合适
Why use pct_counts_mt and pct_counts_ribo as continuous_covariate_keys when integrating the data?
线粒体基因表达量占比和核糖体基因表达量占比在不同批次之间确实可能存在差异。下面是一些可能导致这些差异的原因:
1. 线粒体基因表达量占比的差异原因
细胞质量:如果样品中的细胞质量较差,细胞破裂的可能性较大,线粒体基因表达量占比可能会升高。
样品处理:不同批次的样品在处理过程中可能存在差异,比如细胞裂解、RNA 提取效率等,这些都可能影响线粒体基因表达量占比。
技术噪音:实验设备、试剂和操作人员等技术因素也可能导致不同批次之间的差异。
2. 核糖体基因表达量占比的差异原因
细胞状态:细胞处于不同的生物学状态(如增殖、休眠)可能会影响核糖体基因的表达量。
实验条件:不同批次的实验条件(如培养基成分、温度、培养时间等)可能会影响细胞的核糖体基因表达。
技术变异:与线粒体基因表达量类似,技术噪音也可能导致不同批次之间的核糖体基因表达量占比差异。
Hyperparameter Tuning¶
What is a hyperparameter?
超参数则是在算法运行之前手动设置的参数,用于控制模型的行为和性能。
这些超参数的选择会影响到模型的训练速度、收敛性、容量和泛化能力等方面。
例如,学习率、迭代次数、正则化参数、隐藏层的神经元数量等都是常见的超参数。
超参数的选择通常是一个试错的过程,需要根据经验和领域知识进行调整。
# Print the tuner registry: tunable hyperparameters, available metrics and the default search space
tuner.info()
ModelTuner registry for SCVI
Tunable hyperparameters ┏━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓ ┃ Hyperparameter ┃ Default value ┃ Source ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩ │ n_hidden │ 128 │ VAE │ │ n_latent │ 10 │ VAE │ │ n_layers │ 1 │ VAE │ │ dropout_rate │ 0.1 │ VAE │ │ dispersion │ gene │ VAE │ │ log_variational │ True │ VAE │ │ gene_likelihood │ zinb │ VAE │ │ latent_distribution │ normal │ VAE │ │ encode_covariates │ False │ VAE │ │ deeply_inject_covariates │ True │ VAE │ │ use_batch_norm │ both │ VAE │ │ use_layer_norm │ none │ VAE │ │ use_observed_lib_size │ True │ VAE │ │ var_activation │ None │ VAE │ │ optimizer │ Adam │ TrainingPlan │ │ lr │ 0.001 │ TrainingPlan │ │ weight_decay │ 1e-06 │ TrainingPlan │ │ eps │ 0.01 │ TrainingPlan │ │ n_steps_kl_warmup │ None │ TrainingPlan │ │ n_epochs_kl_warmup │ 400 │ TrainingPlan │ │ reduce_lr_on_plateau │ False │ TrainingPlan │ │ lr_factor │ 0.6 │ TrainingPlan │ │ lr_patience │ 30 │ TrainingPlan │ │ lr_threshold │ 0.0 │ TrainingPlan │ │ lr_min │ 0 │ TrainingPlan │ │ max_kl_weight │ 1.0 │ TrainingPlan │ │ min_kl_weight │ 0.0 │ TrainingPlan │ │ batch_size │ 128 │ SCVI │ └──────────────────────────┴───────────────┴──────────────┘
Available metrics ┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓ ┃ Metric ┃ Mode ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩ │ validation_loss │ min │ └─────────────────┴────────────┘
Default search space ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓ ┃ Hyperparameter ┃ Sample function ┃ Arguments ┃ Keyword arguments ┃ ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩ │ n_hidden │ choice │ [[64, 128]] │ {} │ └────────────────┴─────────────────┴─────────────┴───────────────────┘
The significance of each hyperparameter?
模型结构超参数(VAE):
1. n_hidden:
隐藏层的神经元数量。决定了每个隐藏层的大小
2. n_latent:
潜在空间的维度。潜在空间的维度通常比原始输入数据的维度低,这种降维可以捕捉数据的主要特征,同时滤除噪声。特征学习:通过学习一个低维的潜在表示,模型可以更好地理解和生成数据。
3. n_layers:
隐藏层的数量
4. dropout_rate:
用于防止过拟合,通过随机丢弃神经元来提高模型的泛化能力,并提高运行速度
5. dispersion:
分散参数用于描述数据的变异程度,可选值有 "gene"、"gene-batch"、"gene-label" 和 "gene-cell",其中最常用的是 "gene" 和 "gene-batch":
gene:每个基因都有一个独立的分散参数。适用于基因表达变异性在不同细胞间变化较大的情况。
gene-batch:每个基因在每个批次中都有一个独立的分散参数。适用于不同批次间存在显著技术变异的情况。
gene-cell:每个基因在每个细胞中都有一个独立的分散参数。这种方式很少使用,因为它的参数量非常大,计算开销高且容易过拟合。通常不推荐使用。
6. log_variational:
是否在编码之前对输入的表达数据做 log(1 + x) 变换,以提高数值稳定性(这一步不是归一化)
7. gene_likelihood:
基因表达的似然分布。常见选择有 "zinb"(零膨胀负二项分布)
8. latent_distribution:
潜在空间的分布类型。通常是 "normal"(正态分布)
9. encode_covariates:
是否编码协变量。如果为 True,模型会将协变量编码进潜在空间,协变量通常指那些可能影响基因表达水平的额外变量,指前面分类协变量和连续协变量
10. deeply_inject_covariates:
深度注入协变量,是否将协变量不仅注入到输入层,还要注入到隐藏层,这有助于更全面地整合协变量信息,提高模型的性能,尤其是在存在复杂批次效应或其他混杂因素时
11. use_batch_norm:
批量归一化,指在每一训练批次(mini-batch)中对网络层的输入进行归一化处理。可以选择 "none"、"encoder(编码器)"、"decoder(解码器)"、"both"。
12. use_layer_norm :
是否使用层归一化。可以选择 "none"、"encoder"、"decoder"、"both"。对每一个样本的所有特征进行归一化,而不是对一个批次的所有样本进行归一化,适用于批次小的数据。
13. use_observed_lib_size:
库大小(library size) 是指每个细胞测序得到的总 RNA 片段数。由于测序深度的差异,不同细胞的库大小可能不同,这会影响基因表达量的比较。
use_observed_lib_size 参数用于指定在模型中是否使用实际观察到的库大小进行归一化。
14. var_activation:
指用于保证变分分布方差为正值的激活函数。可以选择不同的激活函数来调整模型输出的范围。
在 SCVI 模型中,var_activation 可以设置为以下几种常用的激活函数:
None:默认值,此时模型内部使用 torch.exp 把网络输出映射为正的方差。
"softplus":定义:Softplus 函数是一个平滑的近似 ReLU 函数,定义为 softplus(𝑥)=log(1+exp(𝑥)),输出为正值
"exp":定义:函数exp(𝑥)=e^x。作用:输出为正值。
训练超参数(TrainingPlan):
训练超参数(TrainingPlan)用于定义模型的训练过程,包括优化器设置、学习率调节、正则化参数等
1. optimizer:
优化器类型。常见选择有:
Adam——为每个参数自适应地调整学习率(结合动量与梯度平方的滑动平均),通常收敛较快,能更快地到达损失的最低点
SGD——随机梯度下降
RMSprop——Adagrad 算法通过累积历史梯度的平方来调整每个参数的学习率,从而实现对频繁更新参数的惩罚和对不频繁更新参数的鼓励。然而,Adagrad 也存在一个问题:随着参数更新的累积,学习率会越来越小,最终导致学习过程提前结束。RMSProp 通过引入一个衰减系数来解决这个问题,使得历史信息能够指数级衰减,从而避免了学习率持续下降的问题
2. lr:
学习率。控制模型参数更新的步伐。
3. weight_decay:
权重衰减率,通过给损失函数增加模型权重L2范数的惩罚(penalty)来让模型权重不要太大,以此来减小模型的复杂度,从而抑制模型的过拟合
4. eps:
一个非常小的正数,用于防止除零错误和增加数值稳定性
5. n_steps_kl_warmup:
KL 散度预热的步骤数。在变分自编码器(VAE)中,损失函数由重构误差和 KL 散度两部分组成。KL 散度衡量潜在分布与先验分布之间的差异。在训练的初期,如果 KL 散度权重较高,可能导致模型的学习不稳定。因此,通过 KL 散度预热,可以在训练的早期阶段逐步增加 KL 散度的权重,使模型更平稳地过渡到考虑 KL 散度的完整损失函数。
6. n_epochs_kl_warmup:
KL 散度预热的 epoch 数。逐步增加 KL 散度权重。
7. reduce_lr_on_plateau:
是否在性能不提升时减少学习率。
8. lr_factor:
学习率减少因子。在性能不提升时减少学习率的比例。
9. lr_patience:
学习率减少的耐心度。在减少学习率之前等待的 epoch 数。
10. lr_threshold:
学习率减少的阈值。在性能变化小于该值时减少学习率。
11. lr_min:
最小学习率。
12. max_kl_weight:
KL散度的最大权重。控制 KL 散度损失的权重。
13. min_kl_weight:
KL散度的最小权重。
SCVI 特定超参数:
1. batch_size:
批量大小。控制每次训练迭代所使用的样本数量。
# Search space handed to ModelTuner: each entry names a hyperparameter and the
# values (or range) it may take. The more entries and values are listed, the
# longer the search takes.
search_space = dict(
    n_hidden=tune.choice([92, 128, 192, 256]),
    n_latent=tune.choice([10, 20, 30, 40, 50, 60]),
    n_layers=tune.choice([1, 2, 3]),
    lr=tune.loguniform(1e-4, 1e-2),
    # Gene likelihood: "nb" is the negative binomial distribution,
    # "zinb" the zero-inflated negative binomial distribution.
    gene_likelihood=tune.choice(["nb", "zinb"]),
)
# Run the hyperparameter search. This takes a very long time because the model
# is trained once per sampled configuration.
tuning_options = {
    # Metric used to rank configurations: the loss on the validation set.
    # A lower validation loss usually means better generalization.
    "metric": "validation_loss",
    # The GPU has to be requested explicitly, otherwise it might not be used.
    "resources": {'gpu': 1},
    # Hyperparameter search space.
    "search_space": search_space,
    # Number of configurations to try: more samples give a better estimate of
    # the best hyperparameters but take longer.
    "num_samples": 100,
    # Maximum epochs per trial. The real training run uses many more epochs;
    # here we only rank configurations, so a small value is enough.
    "max_epochs": 20,
}
results = tuner.fit(adata, **tuning_options)
# The search returns one result per sampled configuration; pick the one with
# the lowest "validation_loss".
# Start from +inf rather than an arbitrary finite number, so the first trial
# always becomes the initial candidate even when every loss is very large.
best_vl = float('inf')
best_i = 0
for i, res in enumerate(results.results):
    vl = res.metrics['validation_loss']
    # Keep the trial with the smallest validation loss seen so far
    if vl < best_vl:
        best_vl = vl
        best_i = i
# Show the full record (metrics, path, ...) of the best configuration
results.results[best_i]
Result( metrics={'validation_loss': 5222.72509765625}, path='/home/mark/ray_results/2024-04-14_00-09-20_scvi/_trainable_3c312990_57_gene_likelihood=zinb,lr=0.0026,n_hidden=256,n_latent=10,n_layers=3_2024-04-14_01-20-32', filesystem='local', checkpoint=None )
Training scVI model¶
# Register the AnnData fields with the model class.
model_cls.setup_anndata(
    adata,
    # Categorical covariate: distinguishes the samples that are being integrated.
    categorical_covariate_keys=['Sample'],
    # Continuous covariates.
    continuous_covariate_keys=['pct_counts_mt', 'pct_counts_ribo'],
)

# Build the model with the best hyperparameters found by the tuning step.
best_architecture = {
    'n_hidden': 256,
    'n_latent': 10,
    'n_layers': 3,
    'gene_likelihood': 'zinb',
}
model = scvi.model.SCVI(adata, **best_architecture)

# Learning rate, passed through to the training plan.
kwargs = {'lr': 0.0026}
model.train(
    max_epochs=200,       # train for at most 200 epochs
    early_stopping=True,  # stop early once validation performance stops improving, to limit overfitting
    plan_kwargs=kwargs,
)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 5/200: 2%| | 4/200 [01:05<53:22, 16.34s/it, v_num=1, train_loss_step=5.6
# Save the trained model to the 'the_model' directory
model.save('the_model')
# Load the saved model back and associate it with an AnnData object (the same one or a new one)
model = scvi.model.SCVI.load('the_model/', adata)
INFO File the_model/model.pt already downloaded
# Pull the per-epoch reconstruction-loss curves out of the training history.
history = model.history
train_curve = history['reconstruction_loss_train']['reconstruction_loss_train']
validation_curve = history['reconstruction_loss_validation']['reconstruction_loss_validation']

# Lowest reconstruction loss reached on the validation set during training.
y = validation_curve.min()

plt.plot(train_curve, label='train')
plt.plot(validation_curve, label='validation')
# Black ('k') horizontal line at the minimum validation reconstruction loss.
plt.axhline(y, c = 'k')
plt.legend()  # show the legend
plt.show()
可以看出该模型很快到达平台期,并在训练周期在75的时候会得到一个最小重构损失。
但为防止过拟合,会选择一个较小的训练周期及早停止,如上述第5个周期的时候停止
如果未出现plateaued,可以尝试提高训练周期
# Save the AnnData object to an .h5ad file
adata.write_h5ad('temp.h5ad')
Dim reduction and clustering¶
umap注意事项
单细胞umap上的2个细胞不是直线距离最短,就像地球平面图一样
umap上的细胞只是把他们按不同的cluster分开而已。
所以并不是2个cluster靠近,说明他们为相似的细胞。
# Store the model's latent representation in adata.obsm so that later analysis and plotting steps can use it.
# The latent representation is the low-dimensional embedding of the data produced by the model (a VAE here).
adata.obsm['X_scVI'] = model.get_latent_representation()
# Inspect the shape of the latent representation
adata.obsm['X_scVI'].shape
(300349, 10)
300349:这是样本的数量,数据集中有 300349 个细胞
10:这是潜在空间的维度,每个细胞在这个 10 维空间中有一个对应的表示
# Build the k-nearest-neighbor graph from the 'X_scVI' representation.
# In a kNN graph every node (cell) is connected to its k nearest neighbors.
sc.pp.neighbors(adata, use_rep = 'X_scVI')
# Cluster the cells on the neighbor graph computed above.
# Leiden is a graph-based clustering algorithm commonly used for single-cell RNA-seq data.
sc.tl.leiden(adata,
resolution = 3, # clustering resolution; higher values produce more clusters
key_added = 'overcluster') # store the cluster assignment in adata.obs under the key 'overcluster'
Leiden 算法:https://www.jianshu.com/p/9770c0a79d38
# Compute the UMAP embedding from the neighbor graph (used for visualization)
sc.tl.umap(adata)
# Copy adata.X into adata.layers['counts'] so the raw counts are kept before normalization or other preprocessing
adata.layers['counts'] = adata.X.copy()
在尝试了30多种不同scale data 的方法,发现Delta Method方法效果最好
# Normalize the counts per cell, then log-transform them, i.e. log(1 + x)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Save the data
adata.write_h5ad('temp.h5ad')
# For each over-cluster, find the most frequent (mode) predicted label and
# assign it to every cell of that cluster:
#   - groupby('overcluster') groups the cells by cluster,
#   - [...] selects the per-cell label column,
#   - transform(lambda x: x.mode()[0]) returns the group's mode for each row.
adata.obs['low_major'] = adata.obs.groupby('overcluster')['low_label'].transform(lambda x: x.mode()[0])
sc.pl.umap(adata, color = ['low_major'], s = 5)  # s = 5: point size

adata.obs['predicted_major'] = adata.obs.groupby('overcluster')['predicted'].transform(lambda x: x.mode()[0])
sc.pl.umap(adata, color = ['predicted_major'], legend_loc = 'on data', s = 5)

adata.obs['ref_major'] = adata.obs.groupby('overcluster')['ref_label'].transform(lambda x: x.mode()[0])
# Plot the per-cluster majority label, like the two plots above. The per-cell
# 'ref_label' was plotted here before, which left 'ref_major' unused.
sc.pl.umap(adata, color = ['ref_major'], s = 5)
Annotation¶
Preparation for annotation¶
# Inspect the prediction scores of the different CellTypist models used earlier.
# Yellow means high confidence; the `ref` predictions for the cells on the right look the most reliable.
sc.pl.umap(adata, color = ['low_score', 'ref_score', 'ref2_score' ])
ref data与query data必须具有相关性!!!
如果ref data与query data不为同一器官,可导致注释错误,并且其confidence score还会比较高的情况!!!!
错误的细胞注释,会引发严重的后果!!!!
# The overall confidence scores of all three models are not very high,
# so we display the individual clusters and annotate them manually.
sc.pl.umap(adata, color = ['overcluster'], legend_loc = 'on data', s = 5)
# Draw the UMAP from a view of `adata` that holds the same cells and genes but
# with the cells in random order, so the plotting order does not bias the picture.

# Fix the random seed so the same shuffled order is produced on every run.
np.random.seed(1)
# adata.shape is (n_obs, n_vars); adata.shape[0] is the number of cells.
# Given an integer n, np.random.permutation shuffles np.arange(n) directly,
# which avoids first building a Python list with one entry per cell.
ri = np.random.permutation(adata.shape[0])
# Plot the UMAP with the shuffled cell order
sc.pl.umap(adata[ri,:], color = ['DX'], vmin = .5, size = 2)
DX:首次被诊断为急性髓系白血病时的状态
REL:这是指患者在治疗后,疾病得到缓解后又再次复发的状态
REM:缓解期是指经过有效治疗后,患者的AML症状得到显著改善
get blast marker gene from other papers
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9506284¶
https://www.pnas.org/doi/pdf/10.1073/pnas.2003900117¶
# Blast marker genes used for scoring
aml_marks = ['CD33', 'CD44', 'CD47', 'BST1', 'CD244', 'HAVCR2', 'CLEC12A', 'IL1RAP', 'MPO', 'AZU1', 'ELANE', 'PRTN3',
'CLEC11A', 'PRAME', 'NREP', 'ARMH1', 'C1QBP', 'TRH']
# Score every cell on the marker gene set
sc.tl.score_genes(adata, aml_marks, score_name = 'AML_blast_score')
# 'AZU1', 'ELANE' and 'MPO' are marker genes of blast cells.
# Blast cells are immature cells that have not fully differentiated. In acute myeloid leukemia (AML), myeloid blasts accumulate in the bone marrow and suppress normal blood cell production.
# These three genes are enriched in the DX and REL cells, which matches expectations.
sc.pl.umap(adata, color = ['AZU1', 'ELANE', 'MPO'], s = 5)
# Look at the blast score: high scores overlap with the DX and REL cells, which matches expectations.
sc.pl.umap(adata, color = ['AML_blast_score'], s = 5)
# Median AML_blast_score of every 'overcluster' category:
#   adata.obs[['overcluster', 'AML_blast_score']] selects the two columns from adata.obs,
#   groupby('overcluster') groups the rows by cluster,
#   .median() computes the median AML_blast_score of each group.
aml_blast_scores = adata.obs[['overcluster', 'AML_blast_score']].groupby('overcluster').median()
# Create a 16x4 figure
plt.figure(figsize = (16, 4))
# Draw the bar plot (one bar per cluster)
sns.barplot(aml_blast_scores, y = 'AML_blast_score', x = aml_blast_scores.index)
<Axes: xlabel='overcluster', ylabel='AML_blast_score'>
label_cols = ['low_label', 'ref_label', 'ref2_label', 'predicted']
score_cols = ['low_score', 'ref_score', 'ref2_score', 'transfer_score']

# Per over-cluster, the most frequent value (mode) of each predicted-label column.
# mode() returns every tied value, so take the first one to get a single
# label per cluster.
labels = adata.obs[label_cols + ['overcluster']].groupby('overcluster').agg(lambda x: x.mode()[0])
# Per over-cluster, the mean of each prediction-score column.
scores = adata.obs[score_cols + ['overcluster']].groupby('overcluster').mean()
# Join labels and scores on the cluster index
mapping_res = labels.merge(right = scores, left_index=True, right_index=True)
# Show rows 40-49 as an example
mapping_res[40:50]
| low_label | ref_label | ref2_label | predicted | low_score | ref_score | ref2_score | transfer_score | |
|---|---|---|---|---|---|---|---|---|
| overcluster | ||||||||
| 40 | Tcm/Naive helper T cells | T | 21_CD4.N2 | CD4.N1 | 0.964791 | 0.980608 | 0.279939 | 0.914714 |
| 41 | Mid erythroid | lateEry | 03_Late.Eryth | Late.Eryth | 0.811064 | 0.996394 | 0.998696 | 0.969359 |
| 42 | Mid erythroid | lateEry | 03_Late.Eryth | Late.Eryth | 0.782480 | 0.775955 | 0.886610 | 0.971583 |
| 43 | Neutrophil-myeloid progenitor | GMP-like | 08_GMP.Neut | GMP.Neut | 0.483528 | 0.691535 | 0.532164 | 0.925160 |
| 44 | Classical monocytes | HSC-like | 12_CD14.Mono.2 | B | 0.702154 | 0.544124 | 0.449953 | 0.985155 |
| 45 | Classical monocytes | cDC-like | 12_CD14.Mono.2 | cDC-like | 0.320454 | 0.357452 | 0.123151 | 0.915570 |
| 46 | Non-classical monocytes | cDC-like | 13_CD16.Mono | CD16.Mono | 0.967654 | 0.700417 | 0.704498 | 0.994551 |
| 47 | Tcm/Naive helper T cells | lateEry | 03_Late.Eryth | Late.Eryth | 0.528499 | 0.856315 | 0.864205 | 0.935238 |
| 48 | Classical monocytes | cDC-like | 17_B | GMP | 0.470039 | 0.592594 | 0.205825 | 0.923858 |
| 49 | Classical monocytes | Mono-like | 12_CD14.Mono.2 | CD14.Mono.2 | 0.916921 | 0.847386 | 0.735044 | 0.959181 |
用 scVI 得出的 transfer_score 整体上会比 celltypist 的预测得分高一些
前三个分数来自 logistic regression(逻辑回归),而 transfer_score 通常会更高
# Compute the differentially expressed genes of every 'overcluster' group
sc.tl.rank_genes_groups(adata, groupby = 'overcluster')
# Extract the differential-expression results as a DataFrame (group = None, i.e. no single group selected)
marks = sc.get.rank_genes_groups_df(adata, group = None)
marks
| group | names | scores | logfoldchanges | pvals | pvals_adj | |
|---|---|---|---|---|---|---|
| 0 | 0 | HLA-B | 236.742691 | 1.942523 | 0.0 | 0.0 |
| 1 | 0 | NKG7 | 220.726761 | 5.321103 | 0.0 | 0.0 |
| 2 | 0 | CCL5 | 200.138092 | 5.461651 | 0.0 | 0.0 |
| 3 | 0 | HLA-C | 200.065216 | 2.085589 | 0.0 | 0.0 |
| 4 | 0 | B2M | 165.315521 | 1.427652 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... |
| 1396291 | 65 | A1BG | -258.354767 | -27.394867 | 0.0 | 0.0 |
| 1396292 | 65 | SYTL1 | -261.925568 | -27.513292 | 0.0 | 0.0 |
| 1396293 | 65 | ALOX5AP | -275.546906 | -28.285286 | 0.0 | 0.0 |
| 1396294 | 65 | SAMSN1 | -294.376343 | -28.066748 | 0.0 | 0.0 |
| 1396295 | 65 | SLC38A1 | -295.444458 | -28.156713 | 0.0 | 0.0 |
1396296 rows × 6 columns
一个很好的marker基因数据库 https://panglaodb.se/
For example:annotation for cluster 59¶
标记cluster59 location
# Draw the UMAP of all cells as a background layer
ax = sc.pl.umap(adata,
palette = 'lightgrey', # draw all cells in light grey
show = False # do not display yet, so another layer can be drawn on top
)
# Overlay the cells of one specific over-cluster
sc.pl.umap(adata[adata.obs.overcluster == '59'],
color = 'overcluster', # color by the overcluster column
ax = ax, # draw on top of the axes created above
legend_loc = None,
palette = 'k' # draw the selected cluster in black
)
查看cluster59 AML分型占比
# Group by the 'overcluster' and 'DX' columns and count the cells in each group
_ = adata.obs.groupby(['overcluster', 'DX']).size().reset_index()
# Keep only the rows where 'overcluster' equals '59'
_[_.overcluster == '59']
| overcluster | DX | 0 | |
|---|---|---|---|
| 177 | 59 | DX | 205 |
| 178 | 59 | REL | 328 |
| 179 | 59 | REM | 336 |
查看cluster59 原始细胞得分
# Select the row for cluster '59' (the index holds the cluster ids as strings)
aml_blast_scores[aml_blast_scores.index == '59']
| AML_blast_score | |
|---|---|
| overcluster | |
| 59 | -0.050902 |
查看cluster59 模型预测得分
# Select the row for cluster '59'. mapping_res is indexed by cluster id (as a
# string), so a boolean mask on the index is needed; mapping_res[59] would look
# up a *column* named 59 and raise KeyError.
mapping_res[mapping_res.index == '59']
查看cluster59 前10的差异基因
# First 10 rows of the differential-expression table for group '59'
marks[(marks.group == '59')].head(10)
| group | names | scores | logfoldchanges | pvals | pvals_adj | |
|---|---|---|---|---|---|---|
| 1248204 | 59 | TUBA1B | 85.918053 | 3.541741 | 0.000000e+00 | 0.000000e+00 |
| 1248205 | 59 | H2AFZ | 78.306709 | 2.916618 | 0.000000e+00 | 0.000000e+00 |
| 1248206 | 59 | PCLAF | 75.170197 | 4.453035 | 0.000000e+00 | 0.000000e+00 |
| 1248207 | 59 | TUBB | 74.742096 | 3.131462 | 0.000000e+00 | 0.000000e+00 |
| 1248208 | 59 | CD74 | 68.952667 | 2.297455 | 0.000000e+00 | 0.000000e+00 |
| 1248209 | 59 | HMGN2 | 66.126389 | 2.594371 | 0.000000e+00 | 0.000000e+00 |
| 1248210 | 59 | GAPDH | 62.795666 | 1.316903 | 0.000000e+00 | 0.000000e+00 |
| 1248211 | 59 | STMN1 | 61.278427 | 3.630430 | 2.703942e-318 | 5.500442e-316 |
| 1248212 | 59 | SLC25A5 | 58.257256 | 1.580897 | 1.406951e-304 | 2.681572e-302 |
| 1248213 | 59 | HMGB2 | 57.884323 | 3.023342 | 3.698718e-301 | 6.745696e-299 |
For example:annotation cluster based on marker genes¶
查看marker gene的分布
# UMAP colored by the expression of the marker genes CSF3R and S100A8
sc.pl.umap(adata, color = ['CSF3R', 'S100A8'], legend_loc = 'on data', s = 5)
查看marker基因在差异基因的得分与logFC,并按logFC进行降序排序
# Rows of the differential-expression table for the gene TPSAB1, sorted by log fold change (descending); show the first few
marks[(marks.names == 'TPSAB1') ].sort_values('logfoldchanges', ascending = False).head()
| group | names | scores | logfoldchanges | pvals | pvals_adj | |
|---|---|---|---|---|---|---|
| 211565 | 10 | TPSAB1 | 87.712479 | 5.592444 | 0.000000e+00 | 0.000000e+00 |
| 84714 | 4 | TPSAB1 | 52.446213 | 2.482129 | 0.000000e+00 | 0.000000e+00 |
| 296333 | 14 | TPSAB1 | 42.197929 | 2.235497 | 0.000000e+00 | 0.000000e+00 |
| 1121485 | 53 | TPSAB1 | 14.254958 | 1.796557 | 7.824700e-43 | 8.411552e-42 |
| 21456 | 1 | TPSAB1 | 39.399048 | 1.649755 | 0.000000e+00 | 0.000000e+00 |
Fill in annotation for each cluster and visualize them¶
# Manual annotation: maps every over-cluster id (as a string, "0".."65") to a
# cell type. Most clusters are AML blasts, so only the other clusters are
# listed explicitly, grouped by the cell type assigned to them.
_non_blast_clusters = {
    "NK": (0, 56),
    "Th": (2, 3, 12, 40, 54),
    "B": (5,),
    "Mono": (6, 13, 18, 30, 32, 49),
    "Tc": (9, 11, 55),
    "Mast": (10,),
    "DC": (23,),
    "ProB": (34, 39),
    "pDC": (37,),
    "erythroid": (41, 42, 47),
    "Neutrophil": (44,),
    "NC-Mono": (46,),
    "HSC-like": (57,),
    # at the end of the day, there may be some small clusters that are almost impossible to annotate
    "Unk": (59,),
    "PreB": (60,),
    "Platelets": (64,),
    "Unk2": (65,),
}
# Label every cluster as a blast first (keys stay in order "0".."65") ...
over2cell = {str(cluster_id): "AML-blast" for cluster_id in range(66)}
# ... then overwrite the clusters that received a different annotation.
over2cell.update({
    str(cluster_id): cell_type
    for cell_type, cluster_ids in _non_blast_clusters.items()
    for cluster_id in cluster_ids
})
# Add the annotation to the AnnData object by mapping each cell's over-cluster id to its cell type
adata.obs['CellType'] = adata.obs.overcluster.map(over2cell)
# Visualize the annotated cell types
sc.pl.umap(adata, color = ['CellType'], s = 2, legend_loc = 'on data')
# Save the data
adata.write_h5ad('annotated.h5ad')
Custom Drawing
# Read the annotated data
adata = sc.read_h5ad('annotated.h5ad')
# Configure the scanpy figure settings:
sc.set_figure_params(dpi_save=600, # save figures at 600 DPI
transparent=True) # use a transparent figure background
# 导入rc_context来临时更改图形的配置参数
from matplotlib.pyplot import rc_context
# Drop the cells whose cell-type annotation contains 'Unk'.
is_unknown = adata.obs.CellType.str.contains('Unk')
adata = adata[~is_unknown]

# Draw the cell-type UMAP on a temporary 8x8 figure and save it.
with rc_context({"figure.figsize": (8, 8)}):
    sc.pl.umap(
        adata,
        color=['CellType'],
        s=10,                  # point size
        legend_loc='on data',
        legend_fontsize=14,    # legend font size
        legend_fontoutline=2,  # outline width of the legend text
        frameon=False,         # no frame around the plot
        title='',              # empty title
        save='cells.png',
    )
WARNING: saving figure to file figures/umapcells.png
# Draw the over-cluster UMAP (no legend) on a temporary 8x8 figure and save it.
with rc_context({"figure.figsize": (8, 8)}):
    sc.pl.umap(
        adata,
        color=['overcluster'],
        s=10,
        legend_loc=None,
        legend_fontsize=14,
        legend_fontoutline=2,
        frameon=False,
        title='',
        save='clusters.png',
    )
WARNING: saving figure to file figures/umapclusters.png